From f190b8c8452a8681480d5435804a6b2a5b760252 Mon Sep 17 00:00:00 2001 From: Tim Hall Date: Tue, 14 May 2024 15:34:45 +0100 Subject: [PATCH] Add support for Ethos-U85 - Added Ethos-U85 support via the --accelerator-config CLI option Change-Id: Ia3c77dbf61c1b7fa9cb03f8f51d336de2f115a3a Signed-off-by: Tim Hall --- .clang-format | 214 + .gitignore | 2 +- .pre-commit-config.yaml | 63 +- README.md | 10 +- TESTING.md | 90 +- THIRDPARTY.md | 34 + ethosu/config_files/Arm/vela.ini | 95 +- ethosu/mlw_codec/mlw_decode.c | 4 +- ethosu/regor/CMakeLists.txt | 355 + ethosu/regor/architecture/architecture.cpp | 221 + ethosu/regor/architecture/architecture.hpp | 391 + .../ethos_u_register_cs_generator.hpp | 66 + ethosu/regor/architecture/ethos_u_scaling.cpp | 69 + ethosu/regor/architecture/ethos_u_scaling.hpp | 37 + .../regor/architecture/ethosu55/ethos_u55.cpp | 932 + .../regor/architecture/ethosu55/ethos_u55.hpp | 239 + .../ethosu55/ethos_u55_performance.cpp | 531 + .../ethosu55/ethos_u55_performance.hpp | 60 + .../ethos_u55_register_cs_generator.cpp | 1469 + .../ethos_u55_register_cs_generator.hpp | 269 + .../ethosu55/ethos_u55_scaling.cpp | 325 + .../ethosu55/ethos_u55_scaling.hpp | 39 + .../ethosu55/ethos_u55_weight_encoder.cpp | 487 + .../ethosu55/ethos_u55_weight_encoder.hpp | 86 + .../regor/architecture/ethosu65/ethos_u65.cpp | 91 + .../regor/architecture/ethosu65/ethos_u65.hpp | 45 + .../ethosu65/ethos_u65_interface.hpp | 21699 ++++++++++++++ .../ethos_u65_register_cs_generator.cpp | 40 + .../ethos_u65_register_cs_generator.hpp | 41 + .../regor/architecture/ethosu85/ethos_u85.cpp | 1324 + .../regor/architecture/ethosu85/ethos_u85.hpp | 230 + .../ethosu85/ethos_u85_interface.hpp | 24312 ++++++++++++++++ .../ethosu85/ethos_u85_performance.cpp | 545 + .../ethosu85/ethos_u85_performance.hpp | 60 + .../ethos_u85_register_cs_generator.cpp | 1828 ++ .../ethos_u85_register_cs_generator.hpp | 250 + .../ethosu85/ethos_u85_scaling.cpp | 327 + .../ethosu85/ethos_u85_scaling.hpp | 38 + 
.../ethosu85/ethos_u85_weight_encoder.cpp | 775 + .../ethosu85/ethos_u85_weight_encoder.hpp | 91 + ethosu/regor/architecture/mlw_encode.cpp | 125 + ethosu/regor/architecture/mlw_encode.hpp | 54 + .../register_command_stream_generator.hpp | 41 + ethosu/regor/architecture/weight_encoder.hpp | 210 + ethosu/regor/bindings/python/py_regor.cpp | 587 + ethosu/regor/cmake/cpack_config.cmake | 83 + ethosu/regor/cmake/pkg-config.cmake.in | 26 + ethosu/regor/cmake/regor_dependencies.cmake | 82 + ethosu/regor/cmake/regor_lib.cmake | 312 + ethosu/regor/cmake/regor_options.cmake | 279 + ethosu/regor/cmake/regor_test.cmake | 123 + ethosu/regor/cmake/regor_thirdparty.cmake | 49 + ethosu/regor/cmake/toolchains/clang.cmake | 51 + ethosu/regor/cmake/toolchains/clang32.cmake | 61 + ethosu/regor/cmake/toolchains/gcc.cmake | 37 + ethosu/regor/cmake/toolchains/gcc32.cmake | 55 + ethosu/regor/cmake/utils.cmake | 75 + ethosu/regor/common/bit_flags.hpp | 332 + ethosu/regor/common/box.hpp | 63 + ethosu/regor/common/buffer_view.hpp | 445 + ethosu/regor/common/common.cpp | 29 + ethosu/regor/common/common.hpp | 172 + ethosu/regor/common/data_type.cpp | 72 + ethosu/regor/common/data_type.hpp | 290 + ethosu/regor/common/dynamic_typing.hpp | 255 + ethosu/regor/common/ini_reader.hpp | 300 + ethosu/regor/common/lexer.hpp | 159 + ethosu/regor/common/logging.cpp | 70 + ethosu/regor/common/logging.hpp | 200 + ethosu/regor/common/numeric_util.hpp | 412 + ethosu/regor/common/ordered_map.hpp | 1112 + ethosu/regor/common/reverse_type.cpp | 30 + ethosu/regor/common/reverse_type.hpp | 28 + ethosu/regor/common/scaling.cpp | 93 + ethosu/regor/common/scaling.hpp | 49 + ethosu/regor/common/shape.hpp | 733 + ethosu/regor/common/transpose_type.cpp | 33 + ethosu/regor/common/transpose_type.hpp | 78 + ethosu/regor/common/vector_span.hpp | 77 + ethosu/regor/compiler/attributes.cpp | 66 + ethosu/regor/compiler/attributes.hpp | 259 + ethosu/regor/compiler/cascade_builder.cpp | 399 + 
ethosu/regor/compiler/cascade_builder.hpp | 99 + ethosu/regor/compiler/compiler.cpp | 455 + ethosu/regor/compiler/compiler.hpp | 133 + ethosu/regor/compiler/database.hpp | 208 + .../regor/compiler/faststorage_allocator.cpp | 403 + .../regor/compiler/faststorage_allocator.hpp | 98 + ethosu/regor/compiler/graph.hpp | 172 + ethosu/regor/compiler/graph_builder.cpp | 530 + ethosu/regor/compiler/graph_builder.hpp | 99 + ethosu/regor/compiler/graph_optimiser.cpp | 341 + ethosu/regor/compiler/graph_optimiser.hpp | 212 + ethosu/regor/compiler/graph_optimiser_db.hpp | 60 + ethosu/regor/compiler/graph_packing.cpp | 249 + ethosu/regor/compiler/graph_packing.hpp | 60 + ethosu/regor/compiler/graph_validator.cpp | 54 + ethosu/regor/compiler/graph_validator.hpp | 59 + ethosu/regor/compiler/graphir_optimiser.cpp | 187 + ethosu/regor/compiler/graphir_optimiser.hpp | 107 + .../compiler/high_level_command_stream.hpp | 273 + .../high_level_command_stream_generator.cpp | 813 + .../high_level_command_stream_generator.hpp | 63 + ethosu/regor/compiler/hillclimb_allocator.cpp | 355 + ethosu/regor/compiler/hillclimb_allocator.hpp | 172 + ethosu/regor/compiler/kernel.hpp | 161 + ethosu/regor/compiler/live_range.cpp | 233 + ethosu/regor/compiler/live_range.hpp | 112 + ethosu/regor/compiler/network_performance.cpp | 317 + ethosu/regor/compiler/network_performance.hpp | 153 + ethosu/regor/compiler/op_type.cpp | 199 + ethosu/regor/compiler/op_type.hpp | 290 + ethosu/regor/compiler/operation.cpp | 168 + ethosu/regor/compiler/operation.hpp | 234 + ethosu/regor/compiler/operation_util.hpp | 242 + ethosu/regor/compiler/optimiser_utils.cpp | 143 + ethosu/regor/compiler/optimiser_utils.hpp | 55 + ethosu/regor/compiler/quantization.cpp | 59 + ethosu/regor/compiler/quantization.hpp | 94 + ethosu/regor/compiler/raw_writer.cpp | 260 + ethosu/regor/compiler/raw_writer.hpp | 53 + ethosu/regor/compiler/scheduler.cpp | 1742 ++ ethosu/regor/compiler/scheduler.hpp | 340 + 
ethosu/regor/compiler/scheduler_decompose.cpp | 326 + ethosu/regor/compiler/scheduler_decompose.hpp | 36 + ethosu/regor/compiler/scheduler_operation.hpp | 305 + ethosu/regor/compiler/scheduler_packing.cpp | 493 + ethosu/regor/compiler/scheduler_packing.hpp | 70 + ethosu/regor/compiler/softmax.cpp | 530 + ethosu/regor/compiler/softmax.hpp | 53 + ethosu/regor/compiler/tensor.cpp | 118 + ethosu/regor/compiler/tensor.hpp | 95 + ethosu/regor/compiler/tensor_allocator.cpp | 149 + ethosu/regor/compiler/tensor_allocator.hpp | 64 + ethosu/regor/compiler/tensor_properties.hpp | 90 + .../regor/compiler/tflite_graph_optimiser.cpp | 2936 ++ .../regor/compiler/tflite_graph_optimiser.hpp | 312 + .../compiler/tflite_graph_optimiser_tp.cpp | 145 + .../regor/compiler/tosa_graph_validator.cpp | 92 + .../regor/compiler/tosa_graph_validator.hpp | 39 + .../dependencies/mlw_codec/CMakeLists.txt | 183 + .../mlw_codec/include/mlw_decode.h | 50 + .../mlw_codec/include/mlw_encode.h | 114 + .../mlw_codec/source/ml_bit_buffer.hpp | 255 + .../mlw_codec/source/ml_encoder_internal.hpp | 128 + .../mlw_codec/source/ml_ethosu_encode.cpp | 114 + .../mlw_codec/source/ml_raw_buffer.hpp | 161 + .../mlw_codec/source/mlw_decode.cpp | 361 + .../mlw_codec/source/mlw_encode.cpp | 961 + .../mlw_codec/source/mlw_encode_fwd.cpp | 197 + .../thirdparty/Catch2/BUILD.bazel | 95 + .../Catch2/CMake/Catch2Config.cmake.in | 10 + .../Catch2/CMake/CatchConfigOptions.cmake | 89 + .../Catch2/CMake/CatchMiscFunctions.cmake | 121 + .../thirdparty/Catch2/CMake/FindGcov.cmake | 157 + .../thirdparty/Catch2/CMake/FindLcov.cmake | 354 + .../thirdparty/Catch2/CMake/Findcodecov.cmake | 258 + .../Catch2/CMake/catch2-with-main.pc.in | 10 + .../thirdparty/Catch2/CMake/catch2.pc.in | 11 + .../thirdparty/Catch2/CMake/llvm-cov-wrapper | 56 + .../thirdparty/Catch2/CMakeLists.txt | 203 + .../thirdparty/Catch2/CMakePresets.json | 26 + .../thirdparty/Catch2/CODE_OF_CONDUCT.md | 46 + .../dependencies/thirdparty/Catch2/Doxyfile | 2650 ++ 
.../thirdparty/Catch2/LICENSE.txt | 23 + .../thirdparty/Catch2/MODULE.bazel | 3 + .../dependencies/thirdparty/Catch2/README.md | 103 + .../thirdparty/Catch2/SECURITY.md | 19 + .../thirdparty/Catch2/WORKSPACE.bazel | 16 + .../thirdparty/Catch2/appveyor.yml | 83 + .../thirdparty/Catch2/codecov.yml | 22 + .../thirdparty/Catch2/conanfile.py | 87 + .../thirdparty/Catch2/extras/Catch.cmake | 304 + .../Catch2/extras/CatchAddTests.cmake | 192 + .../Catch2/extras/CatchShardTests.cmake | 74 + .../Catch2/extras/CatchShardTestsImpl.cmake | 52 + .../Catch2/extras/ParseAndAddCatchTests.cmake | 252 + .../Catch2/extras/catch_amalgamated.cpp | 11516 ++++++++ .../Catch2/extras/catch_amalgamated.hpp | 13915 +++++++++ .../thirdparty/Catch2/extras/gdbinit | 16 + .../thirdparty/Catch2/extras/lldbinit | 16 + .../thirdparty/Catch2/fuzzing/CMakeLists.txt | 20 + .../thirdparty/Catch2/fuzzing/NullOStream.cpp | 18 + .../thirdparty/Catch2/fuzzing/NullOStream.h | 28 + .../Catch2/fuzzing/build_fuzzers.sh | 33 + .../Catch2/fuzzing/fuzz_TestSpecParser.cpp | 22 + .../Catch2/fuzzing/fuzz_XmlWriter.cpp | 22 + .../Catch2/fuzzing/fuzz_textflow.cpp | 53 + .../thirdparty/Catch2/mdsnippets.json | 9 + .../thirdparty/Catch2/meson.build | 19 + .../thirdparty/Catch2/meson_options.txt | 1 + .../thirdparty/Catch2/src/CMakeLists.txt | 524 + .../src/catch2/benchmark/catch_benchmark.hpp | 148 + .../catch2/benchmark/catch_benchmark_all.hpp | 46 + .../catch2/benchmark/catch_chronometer.cpp | 17 + .../catch2/benchmark/catch_chronometer.hpp | 77 + .../src/catch2/benchmark/catch_clock.hpp | 27 + .../catch2/benchmark/catch_constructor.hpp | 82 + .../catch2/benchmark/catch_environment.hpp | 29 + .../src/catch2/benchmark/catch_estimate.hpp | 25 + .../catch2/benchmark/catch_execution_plan.hpp | 58 + .../src/catch2/benchmark/catch_optimizer.hpp | 78 + .../catch_outlier_classification.hpp | 29 + .../benchmark/catch_sample_analysis.hpp | 31 + .../catch2/benchmark/detail/catch_analyse.cpp | 85 + 
.../catch2/benchmark/detail/catch_analyse.hpp | 27 + .../detail/catch_benchmark_function.cpp | 17 + .../detail/catch_benchmark_function.hpp | 107 + .../detail/catch_benchmark_stats.hpp | 48 + .../detail/catch_benchmark_stats_fwd.hpp | 23 + .../detail/catch_complete_invoke.hpp | 58 + .../benchmark/detail/catch_estimate_clock.hpp | 125 + .../catch2/benchmark/detail/catch_measure.hpp | 32 + .../catch2/benchmark/detail/catch_repeat.hpp | 36 + .../detail/catch_run_for_at_least.cpp | 31 + .../detail/catch_run_for_at_least.hpp | 65 + .../catch2/benchmark/detail/catch_stats.cpp | 392 + .../catch2/benchmark/detail/catch_stats.hpp | 60 + .../catch2/benchmark/detail/catch_timing.hpp | 31 + .../Catch2/src/catch2/catch_all.hpp | 135 + .../Catch2/src/catch2/catch_approx.cpp | 85 + .../Catch2/src/catch2/catch_approx.hpp | 128 + .../src/catch2/catch_assertion_info.hpp | 28 + .../src/catch2/catch_assertion_result.cpp | 105 + .../src/catch2/catch_assertion_result.hpp | 60 + .../Catch2/src/catch2/catch_config.cpp | 247 + .../Catch2/src/catch2/catch_config.hpp | 153 + .../src/catch2/catch_get_random_seed.cpp | 18 + .../src/catch2/catch_get_random_seed.hpp | 18 + .../Catch2/src/catch2/catch_message.cpp | 117 + .../Catch2/src/catch2/catch_message.hpp | 150 + .../Catch2/src/catch2/catch_registry_hub.cpp | 106 + .../Catch2/src/catch2/catch_section_info.hpp | 42 + .../Catch2/src/catch2/catch_session.cpp | 364 + .../Catch2/src/catch2/catch_session.hpp | 62 + .../Catch2/src/catch2/catch_tag_alias.hpp | 29 + .../catch2/catch_tag_alias_autoregistrar.cpp | 24 + .../catch2/catch_tag_alias_autoregistrar.hpp | 29 + .../src/catch2/catch_template_test_macros.hpp | 124 + .../src/catch2/catch_test_case_info.cpp | 266 + .../src/catch2/catch_test_case_info.hpp | 132 + .../Catch2/src/catch2/catch_test_macros.hpp | 226 + .../Catch2/src/catch2/catch_test_spec.cpp | 141 + .../Catch2/src/catch2/catch_test_spec.hpp | 119 + .../Catch2/src/catch2/catch_timer.cpp | 37 + .../Catch2/src/catch2/catch_timer.hpp | 27 
+ .../Catch2/src/catch2/catch_tostring.cpp | 254 + .../Catch2/src/catch2/catch_tostring.hpp | 674 + .../Catch2/src/catch2/catch_totals.cpp | 65 + .../Catch2/src/catch2/catch_totals.hpp | 41 + .../src/catch2/catch_translate_exception.cpp | 20 + .../src/catch2/catch_translate_exception.hpp | 88 + .../src/catch2/catch_user_config.hpp.in | 220 + .../Catch2/src/catch2/catch_version.cpp | 43 + .../Catch2/src/catch2/catch_version.hpp | 39 + .../src/catch2/catch_version_macros.hpp | 15 + .../generators/catch_generator_exception.cpp | 17 + .../generators/catch_generator_exception.hpp | 31 + .../catch2/generators/catch_generators.cpp | 42 + .../catch2/generators/catch_generators.hpp | 244 + .../generators/catch_generators_adapters.hpp | 241 + .../generators/catch_generators_all.hpp | 30 + .../generators/catch_generators_random.cpp | 41 + .../generators/catch_generators_random.hpp | 107 + .../generators/catch_generators_range.hpp | 111 + .../interfaces/catch_interfaces_all.hpp | 37 + .../interfaces/catch_interfaces_capture.cpp | 13 + .../interfaces/catch_interfaces_capture.hpp | 110 + .../interfaces/catch_interfaces_config.cpp | 13 + .../interfaces/catch_interfaces_config.hpp | 100 + .../catch_interfaces_enum_values_registry.hpp | 47 + .../interfaces/catch_interfaces_exception.cpp | 14 + .../interfaces/catch_interfaces_exception.hpp | 36 + .../catch_interfaces_generatortracker.cpp | 32 + .../catch_interfaces_generatortracker.hpp | 90 + .../catch_interfaces_registry_hub.cpp | 14 + .../catch_interfaces_registry_hub.hpp | 66 + .../interfaces/catch_interfaces_reporter.cpp | 93 + .../interfaces/catch_interfaces_reporter.hpp | 223 + .../catch_interfaces_reporter_factory.cpp | 14 + .../catch_interfaces_reporter_factory.hpp | 45 + .../catch_interfaces_tag_alias_registry.hpp | 29 + .../catch_interfaces_test_invoker.hpp | 21 + .../interfaces/catch_interfaces_testcase.cpp | 13 + .../interfaces/catch_interfaces_testcase.hpp | 30 + .../internal/catch_assertion_handler.cpp | 82 + 
.../internal/catch_assertion_handler.hpp | 68 + .../catch_case_insensitive_comparisons.cpp | 35 + .../catch_case_insensitive_comparisons.hpp | 30 + .../catch2/internal/catch_case_sensitive.hpp | 17 + .../src/catch2/internal/catch_clara.cpp | 464 + .../src/catch2/internal/catch_clara.hpp | 750 + .../src/catch2/internal/catch_commandline.cpp | 314 + .../src/catch2/internal/catch_commandline.hpp | 21 + .../catch2/internal/catch_compare_traits.hpp | 75 + .../internal/catch_compiler_capabilities.hpp | 447 + .../catch_config_android_logwrite.hpp | 33 + .../catch2/internal/catch_config_counter.hpp | 34 + .../internal/catch_config_prefix_messages.hpp | 29 + .../catch_config_static_analysis_support.hpp | 34 + .../catch_config_uncaught_exceptions.hpp | 46 + .../catch2/internal/catch_config_wchar.hpp | 35 + .../catch2/internal/catch_console_colour.cpp | 282 + .../catch2/internal/catch_console_colour.hpp | 141 + .../catch2/internal/catch_console_width.hpp | 19 + .../internal/catch_container_nonmembers.hpp | 73 + .../src/catch2/internal/catch_context.cpp | 41 + .../src/catch2/internal/catch_context.hpp | 51 + .../catch2/internal/catch_debug_console.cpp | 45 + .../catch2/internal/catch_debug_console.hpp | 17 + .../src/catch2/internal/catch_debugger.cpp | 120 + .../src/catch2/internal/catch_debugger.hpp | 67 + .../src/catch2/internal/catch_decomposer.cpp | 28 + .../src/catch2/internal/catch_decomposer.hpp | 452 + .../src/catch2/internal/catch_enforce.cpp | 41 + .../src/catch2/internal/catch_enforce.hpp | 54 + .../internal/catch_enum_values_registry.cpp | 73 + .../internal/catch_enum_values_registry.hpp | 36 + .../src/catch2/internal/catch_errno_guard.cpp | 16 + .../src/catch2/internal/catch_errno_guard.hpp | 27 + .../catch_exception_translator_registry.cpp | 87 + .../catch_exception_translator_registry.hpp | 30 + .../catch_fatal_condition_handler.cpp | 244 + .../catch_fatal_condition_handler.hpp | 66 + .../internal/catch_floating_point_helpers.cpp | 43 + 
.../internal/catch_floating_point_helpers.hpp | 108 + .../src/catch2/internal/catch_getenv.cpp | 37 + .../src/catch2/internal/catch_getenv.hpp | 20 + .../catch2/internal/catch_is_permutation.hpp | 138 + .../src/catch2/internal/catch_istream.cpp | 154 + .../src/catch2/internal/catch_istream.hpp | 54 + .../src/catch2/internal/catch_jsonwriter.cpp | 148 + .../src/catch2/internal/catch_jsonwriter.hpp | 120 + .../src/catch2/internal/catch_lazy_expr.cpp | 29 + .../src/catch2/internal/catch_lazy_expr.hpp | 40 + .../catch2/internal/catch_leak_detector.cpp | 38 + .../catch2/internal/catch_leak_detector.hpp | 19 + .../Catch2/src/catch2/internal/catch_list.cpp | 120 + .../Catch2/src/catch2/internal/catch_list.hpp | 43 + .../catch2/internal/catch_logical_traits.hpp | 44 + .../Catch2/src/catch2/internal/catch_main.cpp | 39 + .../catch2/internal/catch_message_info.cpp | 25 + .../catch2/internal/catch_message_info.hpp | 42 + .../Catch2/src/catch2/internal/catch_meta.hpp | 47 + .../internal/catch_move_and_forward.hpp | 19 + .../src/catch2/internal/catch_noncopyable.hpp | 28 + .../src/catch2/internal/catch_optional.hpp | 117 + .../catch2/internal/catch_output_redirect.cpp | 146 + .../catch2/internal/catch_output_redirect.hpp | 118 + .../catch2/internal/catch_parse_numbers.cpp | 52 + .../catch2/internal/catch_parse_numbers.hpp | 26 + .../src/catch2/internal/catch_platform.hpp | 37 + .../src/catch2/internal/catch_polyfills.cpp | 42 + .../src/catch2/internal/catch_polyfills.hpp | 21 + .../catch2/internal/catch_preprocessor.hpp | 237 + .../catch_preprocessor_internal_stringify.hpp | 19 + .../catch_preprocessor_remove_parens.hpp | 19 + .../catch_random_floating_point_helpers.hpp | 94 + .../internal/catch_random_integer_helpers.hpp | 201 + .../catch_random_number_generator.cpp | 70 + .../catch_random_number_generator.hpp | 59 + .../internal/catch_random_seed_generation.cpp | 35 + .../internal/catch_random_seed_generation.hpp | 26 + .../internal/catch_reporter_registry.cpp | 91 + 
.../internal/catch_reporter_registry.hpp | 55 + .../internal/catch_reporter_spec_parser.cpp | 173 + .../internal/catch_reporter_spec_parser.hpp | 85 + .../src/catch2/internal/catch_result_type.cpp | 26 + .../src/catch2/internal/catch_result_type.hpp | 57 + .../internal/catch_reusable_string_stream.cpp | 62 + .../internal/catch_reusable_string_stream.hpp | 57 + .../src/catch2/internal/catch_run_context.cpp | 700 + .../src/catch2/internal/catch_run_context.hpp | 161 + .../src/catch2/internal/catch_section.cpp | 60 + .../src/catch2/internal/catch_section.hpp | 104 + .../src/catch2/internal/catch_sharding.hpp | 41 + .../src/catch2/internal/catch_singletons.cpp | 36 + .../src/catch2/internal/catch_singletons.hpp | 45 + .../internal/catch_source_line_info.cpp | 33 + .../internal/catch_source_line_info.hpp | 37 + .../catch_startup_exception_registry.cpp | 29 + .../catch_startup_exception_registry.hpp | 29 + .../src/catch2/internal/catch_stdstreams.cpp | 24 + .../src/catch2/internal/catch_stdstreams.hpp | 22 + .../catch2/internal/catch_stream_end_stop.hpp | 30 + .../catch2/internal/catch_string_manip.cpp | 116 + .../catch2/internal/catch_string_manip.hpp | 61 + .../src/catch2/internal/catch_stringref.cpp | 66 + .../src/catch2/internal/catch_stringref.hpp | 123 + .../internal/catch_tag_alias_registry.cpp | 54 + .../internal/catch_tag_alias_registry.hpp | 33 + .../internal/catch_template_test_registry.hpp | 337 + .../internal/catch_test_case_info_hasher.cpp | 39 + .../internal/catch_test_case_info_hasher.hpp | 29 + .../catch_test_case_registry_impl.cpp | 151 + .../catch_test_case_registry_impl.hpp | 57 + .../internal/catch_test_case_tracker.cpp | 239 + .../internal/catch_test_case_tracker.hpp | 244 + .../internal/catch_test_failure_exception.cpp | 31 + .../internal/catch_test_failure_exception.hpp | 34 + .../catch2/internal/catch_test_macro_impl.hpp | 157 + .../catch2/internal/catch_test_registry.cpp | 82 + .../catch2/internal/catch_test_registry.hpp | 173 + 
.../catch2/internal/catch_test_run_info.hpp | 22 + .../internal/catch_test_spec_parser.cpp | 239 + .../internal/catch_test_spec_parser.hpp | 81 + .../src/catch2/internal/catch_textflow.cpp | 268 + .../src/catch2/internal/catch_textflow.hpp | 189 + .../src/catch2/internal/catch_to_string.hpp | 29 + .../internal/catch_uncaught_exceptions.cpp | 25 + .../internal/catch_uncaught_exceptions.hpp | 15 + ...ch_uniform_floating_point_distribution.hpp | 131 + .../catch_uniform_integer_distribution.hpp | 124 + .../src/catch2/internal/catch_unique_name.hpp | 20 + .../src/catch2/internal/catch_unique_ptr.hpp | 118 + .../src/catch2/internal/catch_void_type.hpp | 25 + .../internal/catch_wildcard_pattern.cpp | 47 + .../internal/catch_wildcard_pattern.hpp | 38 + .../catch2/internal/catch_windows_h_proxy.hpp | 28 + .../src/catch2/internal/catch_xmlwriter.cpp | 348 + .../src/catch2/internal/catch_xmlwriter.hpp | 152 + .../src/catch2/matchers/catch_matchers.cpp | 25 + .../src/catch2/matchers/catch_matchers.hpp | 237 + .../catch2/matchers/catch_matchers_all.hpp | 36 + .../catch_matchers_container_properties.cpp | 34 + .../catch_matchers_container_properties.hpp | 90 + .../matchers/catch_matchers_contains.hpp | 102 + .../matchers/catch_matchers_exception.cpp | 26 + .../matchers/catch_matchers_exception.hpp | 61 + .../catch_matchers_floating_point.cpp | 226 + .../catch_matchers_floating_point.hpp | 94 + .../matchers/catch_matchers_predicate.cpp | 17 + .../matchers/catch_matchers_predicate.hpp | 59 + .../matchers/catch_matchers_quantifiers.cpp | 24 + .../matchers/catch_matchers_quantifiers.hpp | 165 + .../matchers/catch_matchers_range_equals.hpp | 144 + .../catch2/matchers/catch_matchers_string.cpp | 114 + .../catch2/matchers/catch_matchers_string.hpp | 85 + .../matchers/catch_matchers_templated.cpp | 41 + .../matchers/catch_matchers_templated.hpp | 296 + .../catch2/matchers/catch_matchers_vector.hpp | 194 + .../matchers/internal/catch_matchers_impl.cpp | 25 + 
.../matchers/internal/catch_matchers_impl.hpp | 88 + .../thirdparty/Catch2/src/catch2/meson.build | 392 + .../reporters/catch_reporter_automake.cpp | 37 + .../reporters/catch_reporter_automake.hpp | 38 + .../reporters/catch_reporter_common_base.cpp | 49 + .../reporters/catch_reporter_common_base.hpp | 79 + .../reporters/catch_reporter_compact.cpp | 254 + .../reporters/catch_reporter_compact.hpp | 39 + .../reporters/catch_reporter_console.cpp | 665 + .../reporters/catch_reporter_console.hpp | 67 + .../catch_reporter_cumulative_base.cpp | 158 + .../catch_reporter_cumulative_base.hpp | 151 + .../catch_reporter_event_listener.cpp | 40 + .../catch_reporter_event_listener.hpp | 60 + .../reporters/catch_reporter_helpers.cpp | 343 + .../reporters/catch_reporter_helpers.hpp | 95 + .../catch2/reporters/catch_reporter_json.cpp | 372 + .../catch2/reporters/catch_reporter_json.hpp | 95 + .../catch2/reporters/catch_reporter_junit.cpp | 309 + .../catch2/reporters/catch_reporter_junit.hpp | 56 + .../catch2/reporters/catch_reporter_multi.cpp | 197 + .../catch2/reporters/catch_reporter_multi.hpp | 72 + .../reporters/catch_reporter_registrars.cpp | 36 + .../reporters/catch_reporter_registrars.hpp | 131 + .../reporters/catch_reporter_sonarqube.cpp | 162 + .../reporters/catch_reporter_sonarqube.hpp | 59 + .../catch_reporter_streaming_base.cpp | 23 + .../catch_reporter_streaming_base.hpp | 73 + .../catch2/reporters/catch_reporter_tap.cpp | 228 + .../catch2/reporters/catch_reporter_tap.hpp | 42 + .../reporters/catch_reporter_teamcity.cpp | 177 + .../reporters/catch_reporter_teamcity.hpp | 66 + .../catch2/reporters/catch_reporter_xml.cpp | 333 + .../catch2/reporters/catch_reporter_xml.hpp | 66 + .../catch2/reporters/catch_reporters_all.hpp | 41 + .../thirdparty/Catch2/third_party/clara.hpp | 1267 + .../Catch2/tools/misc/CMakeLists.txt | 11 + .../Catch2/tools/misc/SelfTest.vcxproj.user | 23 + .../misc/appveyorBuildConfigurationScript.bat | 21 + .../tools/misc/appveyorMergeCoverageScript.py 
| 9 + .../tools/misc/appveyorTestRunScript.bat | 17 + .../Catch2/tools/misc/coverage-helper.cpp | 142 + .../tools/misc/installOpenCppCoverage.ps1 | 19 + .../Catch2/tools/scripts/approvalTests.py | 243 + .../Catch2/tools/scripts/approve.py | 31 + .../Catch2/tools/scripts/buildAndTest.cmd | 17 + .../Catch2/tools/scripts/buildAndTest.sh | 19 + .../tools/scripts/checkConvenienceHeaders.py | 151 + .../tools/scripts/checkDuplicateFilenames.py | 14 + .../Catch2/tools/scripts/checkLicense.py | 46 + .../Catch2/tools/scripts/developBuild.py | 9 + .../extractFeaturesFromReleaseNotes.py | 92 + .../Catch2/tools/scripts/fixWhitespace.py | 51 + .../tools/scripts/generateAmalgamatedFiles.py | 139 + .../Catch2/tools/scripts/majorRelease.py | 9 + .../Catch2/tools/scripts/minorRelease.py | 9 + .../Catch2/tools/scripts/patchRelease.py | 9 + .../Catch2/tools/scripts/releaseCommon.py | 143 + .../Catch2/tools/scripts/scriptCommon.py | 4 + .../tools/scripts/updateDocumentSnippets.py | 23 + .../Catch2/tools/scripts/updateDocumentToC.py | 447 + .../thirdparty/download_thirdparty.sh | 118 + .../thirdparty/flatbuffers/BUILD.bazel | 139 + .../thirdparty/flatbuffers/CHANGELOG.md | 155 + .../thirdparty/flatbuffers/CMakeLists.txt | 712 + .../thirdparty/flatbuffers/CONTRIBUTING.md | 42 + .../flatbuffers/FlatBuffers.podspec | 21 + .../thirdparty/flatbuffers/Formatters.md | 22 + .../thirdparty/flatbuffers/LICENSE | 202 + .../thirdparty/flatbuffers/Package.swift | 37 + .../flatbuffers/Package@swift-5.5.swift | 37 + .../thirdparty/flatbuffers/README.md | 121 + .../thirdparty/flatbuffers/SECURITY.md | 11 + .../thirdparty/flatbuffers/WORKSPACE | 156 + .../thirdparty/flatbuffers/build_defs.bzl | 280 + .../thirdparty/flatbuffers/composer.json | 18 + .../thirdparty/flatbuffers/conanfile.py | 75 + .../include/flatbuffers/allocator.h | 68 + .../flatbuffers/include/flatbuffers/array.h | 256 + .../flatbuffers/include/flatbuffers/base.h | 495 + .../flatbuffers/include/flatbuffers/buffer.h | 199 + 
.../include/flatbuffers/buffer_ref.h | 53 + .../include/flatbuffers/code_generator.h | 97 + .../include/flatbuffers/code_generators.h | 238 + .../include/flatbuffers/default_allocator.h | 64 + .../include/flatbuffers/detached_buffer.h | 114 + .../include/flatbuffers/file_manager.h | 48 + .../include/flatbuffers/flatbuffer_builder.h | 1465 + .../include/flatbuffers/flatbuffers.h | 284 + .../flatbuffers/include/flatbuffers/flatc.h | 131 + .../include/flatbuffers/flex_flat_util.h | 36 + .../include/flatbuffers/flexbuffers.h | 1887 ++ .../flatbuffers/include/flatbuffers/grpc.h | 299 + .../flatbuffers/include/flatbuffers/hash.h | 127 + .../flatbuffers/include/flatbuffers/idl.h | 1254 + .../include/flatbuffers/minireflect.h | 420 + .../include/flatbuffers/pch/flatc_pch.h | 39 + .../flatbuffers/include/flatbuffers/pch/pch.h | 38 + .../include/flatbuffers/reflection.h | 523 + .../flatbuffers/reflection_generated.h | 1487 + .../include/flatbuffers/registry.h | 130 + .../include/flatbuffers/stl_emulation.h | 513 + .../flatbuffers/include/flatbuffers/string.h | 64 + .../flatbuffers/include/flatbuffers/struct.h | 53 + .../flatbuffers/include/flatbuffers/table.h | 188 + .../flatbuffers/include/flatbuffers/util.h | 732 + .../flatbuffers/include/flatbuffers/vector.h | 397 + .../include/flatbuffers/vector_downward.h | 289 + .../include/flatbuffers/verifier.h | 332 + .../thirdparty/flatbuffers/package.json | 46 + .../thirdparty/flatbuffers/pnpm-lock.yaml | 1184 + .../thirdparty/flatbuffers/swift.swiftformat | 27 + .../thirdparty/flatbuffers/tsconfig.json | 15 + .../thirdparty/flatbuffers/tsconfig.mjs.json | 15 + .../thirdparty/flatbuffers/typescript.bzl | 90 + .../thirdparty/fmt/CMakeLists.txt | 453 + .../thirdparty/fmt/CONTRIBUTING.md | 20 + .../dependencies/thirdparty/fmt/ChangeLog.md | 5533 ++++ .../regor/dependencies/thirdparty/fmt/LICENSE | 27 + .../dependencies/thirdparty/fmt/README.md | 490 + .../thirdparty/fmt/include/fmt/args.h | 235 + 
.../thirdparty/fmt/include/fmt/chrono.h | 2240 ++ .../thirdparty/fmt/include/fmt/color.h | 643 + .../thirdparty/fmt/include/fmt/compile.h | 535 + .../thirdparty/fmt/include/fmt/core.h | 2969 ++ .../thirdparty/fmt/include/fmt/format-inl.h | 1678 ++ .../thirdparty/fmt/include/fmt/format.h | 4535 +++ .../thirdparty/fmt/include/fmt/os.h | 455 + .../thirdparty/fmt/include/fmt/ostream.h | 245 + .../thirdparty/fmt/include/fmt/printf.h | 675 + .../thirdparty/fmt/include/fmt/ranges.h | 738 + .../thirdparty/fmt/include/fmt/std.h | 537 + .../thirdparty/fmt/include/fmt/xchar.h | 259 + .../dependencies/thirdparty/fmt/src/fmt.cc | 108 + .../dependencies/thirdparty/fmt/src/format.cc | 43 + .../dependencies/thirdparty/fmt/src/os.cc | 402 + .../dependencies/thirdparty/gemmlowp/AUTHORS | 14 + .../dependencies/thirdparty/gemmlowp/BUILD | 232 + .../thirdparty/gemmlowp/CONTRIBUTING | 53 + .../thirdparty/gemmlowp/CONTRIBUTORS | 40 + .../dependencies/thirdparty/gemmlowp/LICENSE | 202 + .../thirdparty/gemmlowp/Makefile.travis | 27 + .../thirdparty/gemmlowp/README.md | 276 + .../thirdparty/gemmlowp/WORKSPACE | 0 .../gemmlowp/fixedpoint/fixedpoint.h | 914 + .../gemmlowp/fixedpoint/fixedpoint_avx.h | 384 + .../gemmlowp/fixedpoint/fixedpoint_msa.h | 413 + .../gemmlowp/fixedpoint/fixedpoint_neon.h | 357 + .../gemmlowp/fixedpoint/fixedpoint_sse.h | 388 + .../gemmlowp/fixedpoint/fixedpoint_wasmsimd.h | 381 + .../thirdparty/gemmlowp/flags.bzl | 11 + .../thirdparty/gemmlowp/internal/allocator.h | 197 + .../gemmlowp/internal/block_params.h | 177 + .../thirdparty/gemmlowp/internal/common.h | 184 + .../thirdparty/gemmlowp/internal/compute.h | 118 + .../gemmlowp/internal/detect_platform.h | 171 + .../gemmlowp/internal/dispatch_gemm_shape.h | 207 + .../thirdparty/gemmlowp/internal/kernel.h | 251 + .../thirdparty/gemmlowp/internal/kernel_avx.h | 361 + .../gemmlowp/internal/kernel_default.h | 112 + .../thirdparty/gemmlowp/internal/kernel_msa.h | 579 + .../gemmlowp/internal/kernel_neon.h | 1913 ++ 
.../gemmlowp/internal/kernel_reference.h | 118 + .../thirdparty/gemmlowp/internal/kernel_sse.h | 519 + .../gemmlowp/internal/multi_thread_gemm.h | 721 + .../thirdparty/gemmlowp/internal/output.h | 579 + .../thirdparty/gemmlowp/internal/output_avx.h | 19 + .../thirdparty/gemmlowp/internal/output_msa.h | 1111 + .../gemmlowp/internal/output_neon.h | 922 + .../thirdparty/gemmlowp/internal/output_sse.h | 561 + .../thirdparty/gemmlowp/internal/pack.h | 444 + .../thirdparty/gemmlowp/internal/pack_avx.h | 282 + .../thirdparty/gemmlowp/internal/pack_msa.h | 431 + .../thirdparty/gemmlowp/internal/pack_neon.h | 384 + .../thirdparty/gemmlowp/internal/pack_sse.h | 128 + .../thirdparty/gemmlowp/internal/platform.h | 117 + .../gemmlowp/internal/simd_wrappers.h | 669 + .../internal/simd_wrappers_common_neon_sse.h | 850 + .../gemmlowp/internal/simd_wrappers_msa.h | 191 + .../gemmlowp/internal/simd_wrappers_neon.h | 551 + .../gemmlowp/internal/simd_wrappers_sse.h | 149 + .../gemmlowp/internal/single_thread_gemm.h | 157 + .../thirdparty/gemmlowp/internal/unpack.h | 280 + .../thirdparty/pybind11/CMakeLists.txt | 322 + .../dependencies/thirdparty/pybind11/LICENSE | 29 + .../thirdparty/pybind11/MANIFEST.in | 6 + .../thirdparty/pybind11/README.rst | 180 + .../thirdparty/pybind11/SECURITY.md | 13 + .../pybind11/include/pybind11/attr.h | 690 + .../pybind11/include/pybind11/buffer_info.h | 208 + .../pybind11/include/pybind11/cast.h | 1704 ++ .../pybind11/include/pybind11/chrono.h | 225 + .../pybind11/include/pybind11/common.h | 2 + .../pybind11/include/pybind11/complex.h | 74 + .../pybind11/include/pybind11/detail/class.h | 743 + .../pybind11/include/pybind11/detail/common.h | 1255 + .../pybind11/include/pybind11/detail/descr.h | 171 + .../pybind11/include/pybind11/detail/init.h | 434 + .../include/pybind11/detail/internals.h | 656 + .../pybind11/detail/type_caster_base.h | 1177 + .../pybind11/include/pybind11/detail/typeid.h | 65 + .../pybind11/include/pybind11/eigen.h | 12 + 
.../pybind11/include/pybind11/eigen/common.h | 9 + .../pybind11/include/pybind11/eigen/matrix.h | 714 + .../pybind11/include/pybind11/eigen/tensor.h | 516 + .../pybind11/include/pybind11/embed.h | 316 + .../pybind11/include/pybind11/eval.h | 156 + .../pybind11/include/pybind11/functional.h | 137 + .../pybind11/include/pybind11/gil.h | 239 + .../pybind11/include/pybind11/iostream.h | 265 + .../pybind11/include/pybind11/numpy.h | 1998 ++ .../pybind11/include/pybind11/operators.h | 202 + .../pybind11/include/pybind11/options.h | 92 + .../pybind11/include/pybind11/pybind11.h | 2890 ++ .../pybind11/include/pybind11/pytypes.h | 2557 ++ .../pybind11/include/pybind11/stl.h | 447 + .../include/pybind11/stl/filesystem.h | 116 + .../pybind11/include/pybind11/stl_bind.h | 851 + .../pybind11/type_caster_pyobject_ptr.h | 61 + .../thirdparty/pybind11/noxfile.py | 107 + .../thirdparty/pybind11/pybind11/__init__.py | 17 + .../thirdparty/pybind11/pybind11/__main__.py | 62 + .../thirdparty/pybind11/pybind11/_version.py | 12 + .../thirdparty/pybind11/pybind11/commands.py | 37 + .../thirdparty/pybind11/pybind11/py.typed | 0 .../pybind11/pybind11/setup_helpers.py | 498 + .../thirdparty/pybind11/pyproject.toml | 98 + .../thirdparty/pybind11/setup.cfg | 43 + .../dependencies/thirdparty/pybind11/setup.py | 150 + .../thirdparty/pybind11/tools/FindCatch.cmake | 76 + .../pybind11/tools/FindEigen3.cmake | 86 + .../pybind11/tools/FindPythonLibsNew.cmake | 287 + .../thirdparty/pybind11/tools/JoinPaths.cmake | 23 + .../thirdparty/pybind11/tools/check-style.sh | 44 + .../pybind11/tools/cmake_uninstall.cmake.in | 23 + .../codespell_ignore_lines_from_errors.py | 39 + .../thirdparty/pybind11/tools/libsize.py | 36 + .../pybind11/tools/make_changelog.py | 62 + .../thirdparty/pybind11/tools/pybind11.pc.in | 7 + .../pybind11/tools/pybind11Common.cmake | 405 + .../pybind11/tools/pybind11Config.cmake.in | 233 + .../pybind11/tools/pybind11NewTools.cmake | 256 + .../pybind11/tools/pybind11Tools.cmake | 233 + 
.../thirdparty/pybind11/tools/pyproject.toml | 3 + .../pybind11/tools/setup_global.py.in | 63 + .../pybind11/tools/setup_main.py.in | 44 + ethosu/regor/include/graphapi.hpp | 295 + ethosu/regor/include/graphapi_attr.hpp | 110 + ethosu/regor/include/graphapi_tosa_types.hpp | 127 + ethosu/regor/include/regor.h | 214 + ethosu/regor/include/regor_database.hpp | 57 + ethosu/regor/include/regor_interface.hpp | 45 + ethosu/regor/regor.cpp | 591 + ethosu/regor/test/CMakeLists.txt | 45 + ethosu/regor/test/randomize.hpp | 156 + ethosu/regor/test/test_arch_ethos_u85.cpp | 101 + ethosu/regor/test/test_data_type.cpp | 145 + .../test/test_ethos_u85_weight_encoder.cpp | 123 + ethosu/regor/test/test_graph_packing.cpp | 292 + ethosu/regor/test/test_ini_reader.cpp | 184 + ethosu/regor/test/test_main.cpp | 77 + ethosu/regor/test/test_mlw_encode.cpp | 260 + ethosu/regor/test/test_ordered_map.cpp | 1255 + ethosu/regor/test/test_raw_writer.cpp | 206 + ethosu/regor/test/test_shape.cpp | 412 + ethosu/regor/test/test_tosa_validator.cpp | 89 + ethosu/regor/test/test_transpose_type.cpp | 56 + .../regor/tflite/custom_operator_ethosu.hpp | 212 + ethosu/regor/tflite/flatbuffer_utils.hpp | 49 + ethosu/regor/tflite/tflite_mapping.cpp | 717 + ethosu/regor/tflite/tflite_mapping.hpp | 135 + .../regor/tflite/tflite_model_semantics.cpp | 976 + .../regor/tflite/tflite_model_semantics.hpp | 37 + ethosu/regor/tflite/tflite_reader.cpp | 777 + ethosu/regor/tflite/tflite_reader.hpp | 54 + .../regor/tflite/tflite_schema_generated.hpp | 17552 +++++++++++ ethosu/regor/tflite/tflite_writer.cpp | 778 + ethosu/regor/tflite/tflite_writer.hpp | 134 + ethosu/regor/tools/cppcheck.py | 73 + ethosu/regor/tosa/tosaValidationGenerator.rb | 830 + ethosu/regor/tosa/tosa_argument_checks.cpp | 335 + ethosu/regor/tosa/tosa_argument_checks.hpp | 60 + ethosu/regor/tosa/tosa_error_checks.cpp | 1486 + ethosu/regor/tosa/tosa_error_checks.hpp | 161 + ethosu/regor/tosa/tosa_level_checks.cpp | 269 + 
ethosu/regor/tosa/tosa_level_checks.hpp | 60 + ethosu/regor/tosa/tosa_mapping.cpp | 171 + ethosu/regor/tosa/tosa_mapping.hpp | 41 + ethosu/regor/tosa/tosa_reader.cpp | 879 + ethosu/regor/tosa/tosa_reader.hpp | 40 + ethosu/regor/tosa/tosa_require_checks.cpp | 104 + ethosu/regor/tosa/tosa_require_checks.hpp | 42 + ethosu/regor/tosa/tosa_schema_generated.hpp | 3099 ++ ethosu/regor/tosa/tosa_validator.cpp | 42 + ethosu/regor/tosa/tosa_validator.hpp | 62 + ...sa_validator_version_0_60_0_profile_bi.cpp | 2898 ++ ethosu/vela/__init__.py | 3 +- ethosu/vela/__main__.py | 3 +- ethosu/vela/_version.py | 2 +- ethosu/vela/architecture_features.py | 137 +- ethosu/vela/rawdata_writer.py | 26 +- ethosu/vela/stats_writer.py | 554 +- .../test/test_tflite_supported_operators.py | 4 +- ethosu/vela/tflite_graph_optimiser.py | 2 +- ethosu/vela/tflite_reader.py | 4 +- ethosu/vela/tflite_writer.py | 4 +- ethosu/vela/vela.py | 664 +- pyproject.toml | 28 +- setup.cfg | 6 +- setup.py | 198 +- test/network.py | 79 + test/test_ethos_u_vela.py | 132 + 754 files changed, 263735 insertions(+), 412 deletions(-) create mode 100644 .clang-format create mode 100644 THIRDPARTY.md create mode 100644 ethosu/regor/CMakeLists.txt create mode 100644 ethosu/regor/architecture/architecture.cpp create mode 100644 ethosu/regor/architecture/architecture.hpp create mode 100644 ethosu/regor/architecture/ethos_u_register_cs_generator.hpp create mode 100644 ethosu/regor/architecture/ethos_u_scaling.cpp create mode 100644 ethosu/regor/architecture/ethos_u_scaling.hpp create mode 100644 ethosu/regor/architecture/ethosu55/ethos_u55.cpp create mode 100644 ethosu/regor/architecture/ethosu55/ethos_u55.hpp create mode 100644 ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp create mode 100644 ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp create mode 100644 ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp create mode 100644 
ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp create mode 100644 ethosu/regor/architecture/ethosu55/ethos_u55_scaling.cpp create mode 100644 ethosu/regor/architecture/ethosu55/ethos_u55_scaling.hpp create mode 100644 ethosu/regor/architecture/ethosu55/ethos_u55_weight_encoder.cpp create mode 100644 ethosu/regor/architecture/ethosu55/ethos_u55_weight_encoder.hpp create mode 100644 ethosu/regor/architecture/ethosu65/ethos_u65.cpp create mode 100644 ethosu/regor/architecture/ethosu65/ethos_u65.hpp create mode 100644 ethosu/regor/architecture/ethosu65/ethos_u65_interface.hpp create mode 100644 ethosu/regor/architecture/ethosu65/ethos_u65_register_cs_generator.cpp create mode 100644 ethosu/regor/architecture/ethosu65/ethos_u65_register_cs_generator.hpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85.cpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85.hpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85_interface.hpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.hpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85_scaling.cpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85_scaling.hpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85_weight_encoder.cpp create mode 100644 ethosu/regor/architecture/ethosu85/ethos_u85_weight_encoder.hpp create mode 100644 ethosu/regor/architecture/mlw_encode.cpp create mode 100644 ethosu/regor/architecture/mlw_encode.hpp create mode 100644 ethosu/regor/architecture/register_command_stream_generator.hpp create mode 100644 ethosu/regor/architecture/weight_encoder.hpp create mode 100644 ethosu/regor/bindings/python/py_regor.cpp create mode 
100644 ethosu/regor/cmake/cpack_config.cmake create mode 100644 ethosu/regor/cmake/pkg-config.cmake.in create mode 100644 ethosu/regor/cmake/regor_dependencies.cmake create mode 100644 ethosu/regor/cmake/regor_lib.cmake create mode 100644 ethosu/regor/cmake/regor_options.cmake create mode 100644 ethosu/regor/cmake/regor_test.cmake create mode 100644 ethosu/regor/cmake/regor_thirdparty.cmake create mode 100644 ethosu/regor/cmake/toolchains/clang.cmake create mode 100644 ethosu/regor/cmake/toolchains/clang32.cmake create mode 100644 ethosu/regor/cmake/toolchains/gcc.cmake create mode 100644 ethosu/regor/cmake/toolchains/gcc32.cmake create mode 100644 ethosu/regor/cmake/utils.cmake create mode 100644 ethosu/regor/common/bit_flags.hpp create mode 100644 ethosu/regor/common/box.hpp create mode 100644 ethosu/regor/common/buffer_view.hpp create mode 100644 ethosu/regor/common/common.cpp create mode 100644 ethosu/regor/common/common.hpp create mode 100644 ethosu/regor/common/data_type.cpp create mode 100644 ethosu/regor/common/data_type.hpp create mode 100644 ethosu/regor/common/dynamic_typing.hpp create mode 100644 ethosu/regor/common/ini_reader.hpp create mode 100644 ethosu/regor/common/lexer.hpp create mode 100644 ethosu/regor/common/logging.cpp create mode 100644 ethosu/regor/common/logging.hpp create mode 100644 ethosu/regor/common/numeric_util.hpp create mode 100644 ethosu/regor/common/ordered_map.hpp create mode 100644 ethosu/regor/common/reverse_type.cpp create mode 100644 ethosu/regor/common/reverse_type.hpp create mode 100644 ethosu/regor/common/scaling.cpp create mode 100644 ethosu/regor/common/scaling.hpp create mode 100644 ethosu/regor/common/shape.hpp create mode 100644 ethosu/regor/common/transpose_type.cpp create mode 100644 ethosu/regor/common/transpose_type.hpp create mode 100644 ethosu/regor/common/vector_span.hpp create mode 100644 ethosu/regor/compiler/attributes.cpp create mode 100644 ethosu/regor/compiler/attributes.hpp create mode 100644 
ethosu/regor/compiler/cascade_builder.cpp create mode 100644 ethosu/regor/compiler/cascade_builder.hpp create mode 100644 ethosu/regor/compiler/compiler.cpp create mode 100644 ethosu/regor/compiler/compiler.hpp create mode 100644 ethosu/regor/compiler/database.hpp create mode 100644 ethosu/regor/compiler/faststorage_allocator.cpp create mode 100644 ethosu/regor/compiler/faststorage_allocator.hpp create mode 100644 ethosu/regor/compiler/graph.hpp create mode 100644 ethosu/regor/compiler/graph_builder.cpp create mode 100644 ethosu/regor/compiler/graph_builder.hpp create mode 100644 ethosu/regor/compiler/graph_optimiser.cpp create mode 100644 ethosu/regor/compiler/graph_optimiser.hpp create mode 100644 ethosu/regor/compiler/graph_optimiser_db.hpp create mode 100644 ethosu/regor/compiler/graph_packing.cpp create mode 100644 ethosu/regor/compiler/graph_packing.hpp create mode 100644 ethosu/regor/compiler/graph_validator.cpp create mode 100644 ethosu/regor/compiler/graph_validator.hpp create mode 100644 ethosu/regor/compiler/graphir_optimiser.cpp create mode 100644 ethosu/regor/compiler/graphir_optimiser.hpp create mode 100644 ethosu/regor/compiler/high_level_command_stream.hpp create mode 100644 ethosu/regor/compiler/high_level_command_stream_generator.cpp create mode 100644 ethosu/regor/compiler/high_level_command_stream_generator.hpp create mode 100644 ethosu/regor/compiler/hillclimb_allocator.cpp create mode 100644 ethosu/regor/compiler/hillclimb_allocator.hpp create mode 100644 ethosu/regor/compiler/kernel.hpp create mode 100644 ethosu/regor/compiler/live_range.cpp create mode 100644 ethosu/regor/compiler/live_range.hpp create mode 100644 ethosu/regor/compiler/network_performance.cpp create mode 100644 ethosu/regor/compiler/network_performance.hpp create mode 100644 ethosu/regor/compiler/op_type.cpp create mode 100644 ethosu/regor/compiler/op_type.hpp create mode 100644 ethosu/regor/compiler/operation.cpp create mode 100644 ethosu/regor/compiler/operation.hpp create 
mode 100644 ethosu/regor/compiler/operation_util.hpp create mode 100644 ethosu/regor/compiler/optimiser_utils.cpp create mode 100644 ethosu/regor/compiler/optimiser_utils.hpp create mode 100644 ethosu/regor/compiler/quantization.cpp create mode 100644 ethosu/regor/compiler/quantization.hpp create mode 100644 ethosu/regor/compiler/raw_writer.cpp create mode 100644 ethosu/regor/compiler/raw_writer.hpp create mode 100644 ethosu/regor/compiler/scheduler.cpp create mode 100644 ethosu/regor/compiler/scheduler.hpp create mode 100644 ethosu/regor/compiler/scheduler_decompose.cpp create mode 100644 ethosu/regor/compiler/scheduler_decompose.hpp create mode 100644 ethosu/regor/compiler/scheduler_operation.hpp create mode 100644 ethosu/regor/compiler/scheduler_packing.cpp create mode 100644 ethosu/regor/compiler/scheduler_packing.hpp create mode 100644 ethosu/regor/compiler/softmax.cpp create mode 100644 ethosu/regor/compiler/softmax.hpp create mode 100644 ethosu/regor/compiler/tensor.cpp create mode 100644 ethosu/regor/compiler/tensor.hpp create mode 100644 ethosu/regor/compiler/tensor_allocator.cpp create mode 100644 ethosu/regor/compiler/tensor_allocator.hpp create mode 100644 ethosu/regor/compiler/tensor_properties.hpp create mode 100644 ethosu/regor/compiler/tflite_graph_optimiser.cpp create mode 100644 ethosu/regor/compiler/tflite_graph_optimiser.hpp create mode 100644 ethosu/regor/compiler/tflite_graph_optimiser_tp.cpp create mode 100644 ethosu/regor/compiler/tosa_graph_validator.cpp create mode 100644 ethosu/regor/compiler/tosa_graph_validator.hpp create mode 100644 ethosu/regor/dependencies/mlw_codec/CMakeLists.txt create mode 100644 ethosu/regor/dependencies/mlw_codec/include/mlw_decode.h create mode 100644 ethosu/regor/dependencies/mlw_codec/include/mlw_encode.h create mode 100644 ethosu/regor/dependencies/mlw_codec/source/ml_bit_buffer.hpp create mode 100644 ethosu/regor/dependencies/mlw_codec/source/ml_encoder_internal.hpp create mode 100644 
ethosu/regor/dependencies/mlw_codec/source/ml_ethosu_encode.cpp create mode 100644 ethosu/regor/dependencies/mlw_codec/source/ml_raw_buffer.hpp create mode 100644 ethosu/regor/dependencies/mlw_codec/source/mlw_decode.cpp create mode 100644 ethosu/regor/dependencies/mlw_codec/source/mlw_encode.cpp create mode 100644 ethosu/regor/dependencies/mlw_codec/source/mlw_encode_fwd.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/BUILD.bazel create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMake/Catch2Config.cmake.in create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMake/CatchConfigOptions.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMake/CatchMiscFunctions.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMake/FindGcov.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMake/FindLcov.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMake/Findcodecov.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMake/catch2-with-main.pc.in create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMake/catch2.pc.in create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/CMake/llvm-cov-wrapper create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMakeLists.txt create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CMakePresets.json create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/CODE_OF_CONDUCT.md create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/Doxyfile create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/LICENSE.txt create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/MODULE.bazel create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/README.md create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/SECURITY.md create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/WORKSPACE.bazel create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/appveyor.yml create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/codecov.yml create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/conanfile.py create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/extras/Catch.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/extras/CatchAddTests.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/extras/CatchShardTests.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/extras/CatchShardTestsImpl.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/extras/ParseAndAddCatchTests.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/extras/catch_amalgamated.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/extras/catch_amalgamated.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/extras/gdbinit create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/extras/lldbinit create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/fuzzing/CMakeLists.txt create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/fuzzing/NullOStream.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/fuzzing/NullOStream.h create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/fuzzing/build_fuzzers.sh create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/fuzzing/fuzz_TestSpecParser.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/fuzzing/fuzz_XmlWriter.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/fuzzing/fuzz_textflow.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/mdsnippets.json create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/meson.build create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/meson_options.txt create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/CMakeLists.txt create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_benchmark.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_benchmark_all.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_chronometer.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_chronometer.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_clock.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_constructor.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_environment.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_estimate.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_execution_plan.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_optimizer.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_outlier_classification.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/catch_sample_analysis.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_analyse.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_analyse.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_benchmark_function.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_benchmark_function.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_benchmark_stats.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_benchmark_stats_fwd.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_complete_invoke.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_estimate_clock.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_measure.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_repeat.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_run_for_at_least.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_run_for_at_least.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_stats.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_stats.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/benchmark/detail/catch_timing.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_all.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_approx.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_approx.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_assertion_info.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_assertion_result.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_assertion_result.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_config.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_config.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_get_random_seed.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_get_random_seed.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_message.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_message.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_registry_hub.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_section_info.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_session.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_session.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_tag_alias.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_tag_alias_autoregistrar.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_tag_alias_autoregistrar.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_template_test_macros.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_test_case_info.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_test_case_info.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_test_macros.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_test_spec.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_test_spec.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_timer.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_timer.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_tostring.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_tostring.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_totals.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_totals.hpp create mode 
100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_translate_exception.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_translate_exception.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_user_config.hpp.in create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_version.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_version.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/catch_version_macros.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/generators/catch_generator_exception.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/generators/catch_generator_exception.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/generators/catch_generators.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/generators/catch_generators.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/generators/catch_generators_adapters.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/generators/catch_generators_all.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/generators/catch_generators_random.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/generators/catch_generators_random.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/generators/catch_generators_range.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_all.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_capture.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_capture.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_config.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_config.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_enum_values_registry.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_exception.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_exception.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_generatortracker.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_generatortracker.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_registry_hub.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_registry_hub.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_reporter.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_reporter.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_reporter_factory.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_reporter_factory.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_tag_alias_registry.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_test_invoker.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_testcase.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/interfaces/catch_interfaces_testcase.hpp 
create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_assertion_handler.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_assertion_handler.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_case_insensitive_comparisons.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_case_insensitive_comparisons.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_case_sensitive.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_clara.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_clara.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_commandline.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_commandline.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_compare_traits.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_compiler_capabilities.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_config_android_logwrite.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_config_counter.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_config_prefix_messages.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_config_static_analysis_support.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_config_uncaught_exceptions.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_config_wchar.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_console_colour.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_console_colour.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_console_width.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_container_nonmembers.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_context.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_context.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_debug_console.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_debug_console.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_debugger.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_debugger.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_decomposer.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_decomposer.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_enforce.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_enforce.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_enum_values_registry.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_enum_values_registry.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_errno_guard.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_errno_guard.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_exception_translator_registry.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_exception_translator_registry.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_fatal_condition_handler.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_fatal_condition_handler.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_floating_point_helpers.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_floating_point_helpers.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_getenv.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_getenv.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_is_permutation.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_istream.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_istream.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_jsonwriter.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_jsonwriter.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_lazy_expr.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_lazy_expr.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_leak_detector.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_leak_detector.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_list.cpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_list.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_logical_traits.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_main.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_message_info.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_message_info.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_meta.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_move_and_forward.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_noncopyable.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_optional.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_output_redirect.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_output_redirect.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_parse_numbers.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_parse_numbers.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_platform.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_polyfills.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_polyfills.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_preprocessor.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_preprocessor_internal_stringify.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_preprocessor_remove_parens.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_random_floating_point_helpers.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_random_integer_helpers.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_random_number_generator.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_random_number_generator.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_random_seed_generation.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_random_seed_generation.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_reporter_registry.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_reporter_registry.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_reporter_spec_parser.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_reporter_spec_parser.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_result_type.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_result_type.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_reusable_string_stream.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_reusable_string_stream.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_run_context.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_run_context.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_section.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_section.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_sharding.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_singletons.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_singletons.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_source_line_info.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_source_line_info.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_startup_exception_registry.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_startup_exception_registry.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_stdstreams.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_stdstreams.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_stream_end_stop.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_string_manip.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_string_manip.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_stringref.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_stringref.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_tag_alias_registry.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_tag_alias_registry.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_template_test_registry.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_case_info_hasher.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_case_info_hasher.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_case_registry_impl.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_case_registry_impl.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_case_tracker.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_case_tracker.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_failure_exception.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_failure_exception.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_macro_impl.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_registry.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_registry.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_run_info.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_spec_parser.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_test_spec_parser.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_textflow.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_textflow.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_to_string.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_uncaught_exceptions.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_uncaught_exceptions.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_uniform_floating_point_distribution.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_uniform_integer_distribution.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_unique_name.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_unique_ptr.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_void_type.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_wildcard_pattern.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_wildcard_pattern.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_windows_h_proxy.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_xmlwriter.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/internal/catch_xmlwriter.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_all.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_container_properties.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_container_properties.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_contains.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_exception.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_exception.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_floating_point.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_floating_point.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_predicate.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_predicate.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_quantifiers.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_quantifiers.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_range_equals.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_string.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_string.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_templated.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_templated.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/catch_matchers_vector.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/internal/catch_matchers_impl.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/matchers/internal/catch_matchers_impl.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/meson.build create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_automake.cpp create mode 
100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_automake.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_common_base.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_common_base.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_compact.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_compact.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_console.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_console.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_cumulative_base.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_cumulative_base.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_event_listener.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_event_listener.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_helpers.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_helpers.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_json.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_json.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_junit.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_junit.hpp create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_multi.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_multi.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_registrars.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_registrars.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_sonarqube.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_sonarqube.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_streaming_base.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_streaming_base.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_tap.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_tap.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_teamcity.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_teamcity.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_xml.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporter_xml.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/src/catch2/reporters/catch_reporters_all.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/third_party/clara.hpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/tools/misc/CMakeLists.txt create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/tools/misc/SelfTest.vcxproj.user create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/tools/misc/appveyorBuildConfigurationScript.bat create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/tools/misc/appveyorMergeCoverageScript.py create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/tools/misc/appveyorTestRunScript.bat create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/tools/misc/coverage-helper.cpp create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/tools/misc/installOpenCppCoverage.ps1 create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/approvalTests.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/approve.py create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/buildAndTest.cmd create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/buildAndTest.sh create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/checkConvenienceHeaders.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/checkDuplicateFilenames.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/checkLicense.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/developBuild.py create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/extractFeaturesFromReleaseNotes.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/fixWhitespace.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/generateAmalgamatedFiles.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/majorRelease.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/minorRelease.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/patchRelease.py create mode 100644 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/releaseCommon.py create mode 100644 
ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/scriptCommon.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/updateDocumentSnippets.py create mode 100755 ethosu/regor/dependencies/thirdparty/Catch2/tools/scripts/updateDocumentToC.py create mode 100644 ethosu/regor/dependencies/thirdparty/download_thirdparty.sh create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/BUILD.bazel create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/CHANGELOG.md create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/CMakeLists.txt create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/CONTRIBUTING.md create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/FlatBuffers.podspec create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/Formatters.md create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/LICENSE create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/Package.swift create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/Package@swift-5.5.swift create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/README.md create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/SECURITY.md create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/WORKSPACE create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/build_defs.bzl create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/composer.json create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/conanfile.py create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/allocator.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/array.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/base.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/buffer.h create mode 100644 
ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/buffer_ref.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/code_generator.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/code_generators.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/default_allocator.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/detached_buffer.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/file_manager.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/flatbuffer_builder.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/flatbuffers.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/flatc.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/flex_flat_util.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/flexbuffers.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/grpc.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/hash.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/idl.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/minireflect.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/pch/flatc_pch.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/pch/pch.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/reflection.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/reflection_generated.h create mode 100644 
ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/registry.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/stl_emulation.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/string.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/struct.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/table.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/util.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/vector.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/vector_downward.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/include/flatbuffers/verifier.h create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/package.json create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/pnpm-lock.yaml create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/swift.swiftformat create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/tsconfig.json create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/tsconfig.mjs.json create mode 100644 ethosu/regor/dependencies/thirdparty/flatbuffers/typescript.bzl create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/CMakeLists.txt create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/CONTRIBUTING.md create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/ChangeLog.md create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/LICENSE create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/README.md create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/args.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/chrono.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/color.h create mode 100644 
ethosu/regor/dependencies/thirdparty/fmt/include/fmt/compile.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/core.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/format-inl.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/format.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/os.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/ostream.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/printf.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/ranges.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/std.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/include/fmt/xchar.h create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/src/fmt.cc create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/src/format.cc create mode 100644 ethosu/regor/dependencies/thirdparty/fmt/src/os.cc create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/AUTHORS create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/BUILD create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/CONTRIBUTING create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/CONTRIBUTORS create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/LICENSE create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/Makefile.travis create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/README.md create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/WORKSPACE create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/fixedpoint/fixedpoint.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/fixedpoint/fixedpoint_avx.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/fixedpoint/fixedpoint_msa.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/fixedpoint/fixedpoint_neon.h create mode 100644 
ethosu/regor/dependencies/thirdparty/gemmlowp/fixedpoint/fixedpoint_sse.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/fixedpoint/fixedpoint_wasmsimd.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/flags.bzl create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/allocator.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/block_params.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/common.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/compute.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/detect_platform.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/dispatch_gemm_shape.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/kernel.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/kernel_avx.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/kernel_default.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/kernel_msa.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/kernel_neon.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/kernel_reference.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/kernel_sse.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/multi_thread_gemm.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/output.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/output_avx.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/output_msa.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/output_neon.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/output_sse.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/pack.h 
create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/pack_avx.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/pack_msa.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/pack_neon.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/pack_sse.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/platform.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/simd_wrappers.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/simd_wrappers_common_neon_sse.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/simd_wrappers_msa.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/simd_wrappers_neon.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/simd_wrappers_sse.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/single_thread_gemm.h create mode 100644 ethosu/regor/dependencies/thirdparty/gemmlowp/internal/unpack.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/CMakeLists.txt create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/LICENSE create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/MANIFEST.in create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/README.rst create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/SECURITY.md create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/attr.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/buffer_info.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/cast.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/chrono.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/common.h create mode 100644 
ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/complex.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/detail/class.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/detail/common.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/detail/descr.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/detail/init.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/detail/internals.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/detail/type_caster_base.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/detail/typeid.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/eigen.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/eigen/common.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/eigen/matrix.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/eigen/tensor.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/embed.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/eval.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/functional.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/gil.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/iostream.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/numpy.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/operators.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/options.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/pybind11.h create mode 100644 
ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/pytypes.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/stl.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/stl/filesystem.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/stl_bind.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/include/pybind11/type_caster_pyobject_ptr.h create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/noxfile.py create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/pybind11/__init__.py create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/pybind11/__main__.py create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/pybind11/_version.py create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/pybind11/commands.py create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/pybind11/py.typed create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/pybind11/setup_helpers.py create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/pyproject.toml create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/setup.cfg create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/setup.py create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/FindCatch.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/FindEigen3.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/FindPythonLibsNew.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/JoinPaths.cmake create mode 100755 ethosu/regor/dependencies/thirdparty/pybind11/tools/check-style.sh create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/cmake_uninstall.cmake.in create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/codespell_ignore_lines_from_errors.py create mode 100644 
ethosu/regor/dependencies/thirdparty/pybind11/tools/libsize.py create mode 100755 ethosu/regor/dependencies/thirdparty/pybind11/tools/make_changelog.py create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/pybind11.pc.in create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/pybind11Common.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/pybind11Config.cmake.in create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/pybind11NewTools.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/pybind11Tools.cmake create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/pyproject.toml create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/setup_global.py.in create mode 100644 ethosu/regor/dependencies/thirdparty/pybind11/tools/setup_main.py.in create mode 100644 ethosu/regor/include/graphapi.hpp create mode 100644 ethosu/regor/include/graphapi_attr.hpp create mode 100644 ethosu/regor/include/graphapi_tosa_types.hpp create mode 100644 ethosu/regor/include/regor.h create mode 100644 ethosu/regor/include/regor_database.hpp create mode 100644 ethosu/regor/include/regor_interface.hpp create mode 100644 ethosu/regor/regor.cpp create mode 100644 ethosu/regor/test/CMakeLists.txt create mode 100644 ethosu/regor/test/randomize.hpp create mode 100644 ethosu/regor/test/test_arch_ethos_u85.cpp create mode 100644 ethosu/regor/test/test_data_type.cpp create mode 100644 ethosu/regor/test/test_ethos_u85_weight_encoder.cpp create mode 100644 ethosu/regor/test/test_graph_packing.cpp create mode 100644 ethosu/regor/test/test_ini_reader.cpp create mode 100644 ethosu/regor/test/test_main.cpp create mode 100644 ethosu/regor/test/test_mlw_encode.cpp create mode 100644 ethosu/regor/test/test_ordered_map.cpp create mode 100644 ethosu/regor/test/test_raw_writer.cpp create mode 100644 ethosu/regor/test/test_shape.cpp create mode 100644 
ethosu/regor/test/test_tosa_validator.cpp create mode 100644 ethosu/regor/test/test_transpose_type.cpp create mode 100644 ethosu/regor/tflite/custom_operator_ethosu.hpp create mode 100644 ethosu/regor/tflite/flatbuffer_utils.hpp create mode 100644 ethosu/regor/tflite/tflite_mapping.cpp create mode 100644 ethosu/regor/tflite/tflite_mapping.hpp create mode 100644 ethosu/regor/tflite/tflite_model_semantics.cpp create mode 100644 ethosu/regor/tflite/tflite_model_semantics.hpp create mode 100644 ethosu/regor/tflite/tflite_reader.cpp create mode 100644 ethosu/regor/tflite/tflite_reader.hpp create mode 100644 ethosu/regor/tflite/tflite_schema_generated.hpp create mode 100644 ethosu/regor/tflite/tflite_writer.cpp create mode 100644 ethosu/regor/tflite/tflite_writer.hpp create mode 100755 ethosu/regor/tools/cppcheck.py create mode 100755 ethosu/regor/tosa/tosaValidationGenerator.rb create mode 100644 ethosu/regor/tosa/tosa_argument_checks.cpp create mode 100644 ethosu/regor/tosa/tosa_argument_checks.hpp create mode 100644 ethosu/regor/tosa/tosa_error_checks.cpp create mode 100644 ethosu/regor/tosa/tosa_error_checks.hpp create mode 100644 ethosu/regor/tosa/tosa_level_checks.cpp create mode 100644 ethosu/regor/tosa/tosa_level_checks.hpp create mode 100644 ethosu/regor/tosa/tosa_mapping.cpp create mode 100644 ethosu/regor/tosa/tosa_mapping.hpp create mode 100644 ethosu/regor/tosa/tosa_reader.cpp create mode 100644 ethosu/regor/tosa/tosa_reader.hpp create mode 100644 ethosu/regor/tosa/tosa_require_checks.cpp create mode 100644 ethosu/regor/tosa/tosa_require_checks.hpp create mode 100644 ethosu/regor/tosa/tosa_schema_generated.hpp create mode 100644 ethosu/regor/tosa/tosa_validator.cpp create mode 100644 ethosu/regor/tosa/tosa_validator.hpp create mode 100644 ethosu/regor/tosa/tosa_validator_version_0_60_0_profile_bi.cpp mode change 100644 => 100755 ethosu/vela/vela.py create mode 100644 test/network.py create mode 100644 test/test_ethos_u_vela.py diff --git a/.clang-format 
b/.clang-format new file mode 100644 index 00000000..778d3ab0 --- /dev/null +++ b/.clang-format @@ -0,0 +1,214 @@ +Language: Cpp +# BasedOnStyle: Google +AccessModifierOffset: -4 +AlignAfterOpenBracket: DontAlign +# AlignArrayOfStructures: Left +AlignConsecutiveMacros: None +AlignConsecutiveAssignments: None +AlignConsecutiveBitFields: None +AlignConsecutiveDeclarations: None +AlignEscapedNewlines: DontAlign +AlignOperands: Align +AlignTrailingComments: true +#AllowAllArgumentsOnNextLine: true +#AllowAllConstructorInitializersOnNextLine: true +#AllowAllParametersOfDeclarationOnNextLine: true +AllowShortEnumsOnASingleLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: InlineOnly +#AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: AllIfsAndElse +AllowShortLoopsOnASingleLine: false +#AlwaysBreakAfterDefinitionReturnType: None +#AlwaysBreakAfterReturnType: None +#AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: Yes +#AttributeMacros: +# - __capability +#BinPackArguments: true +#BinPackParameters: true +BraceWrapping: + AfterCaseLabel: true + AfterClass: true + AfterControlStatement: Always + AfterEnum: true + AfterFunction: true + AfterNamespace: true + AfterObjCDeclaration: true + AfterStruct: true + AfterUnion: true + AfterExternBlock: false + BeforeCatch: true + BeforeElse: true + BeforeLambdaBody: true + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +#BreakBeforeBinaryOperators: None +#BreakBeforeConceptDeclarations: true +BreakBeforeBraces: Custom +#BreakBeforeInheritanceComma: false +BreakInheritanceList: AfterColon +BreakBeforeTernaryOperators: false +BreakConstructorInitializers: AfterColon +#BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: false +ColumnLimit: 120 +#CommentPragmas: '^ IWYU pragma:' +#CompactNamespaces: false 
+ConstructorInitializerIndentWidth: 8 +ContinuationIndentWidth: 4 +#Cpp11BracedListStyle: true +#DeriveLineEnding: true +DerivePointerAlignment: false +#DisableFormat: false +#EmptyLineAfterAccessModifier: Never +#EmptyLineBeforeAccessModifier: LogicalBlock +#ExperimentalAutoDetectBinPacking: false +#FixNamespaceComments: true +#ForEachMacros: +# - foreach +# - Q_FOREACH +# - BOOST_FOREACH +#IfMacros: +# - KJ_IF_MAYBE +IncludeBlocks: Regroup +IncludeCategories: +- Regex: \"common/common.hpp\" + Priority: 1 + SortPriority: 0 + CaseSensitive: false +- Regex: \"common/logging.hpp\" + Priority: 1 + SortPriority: 1 + CaseSensitive: false +#- Regex: \"common/.*\.hpp\" +# Priority: 1 +# SortPriority: 2 +# CaseSensitive: false +- Regex: \".*\.hpp\" + Priority: 2 + SortPriority: 3 + CaseSensitive: false +- Regex: ^ + Priority: 3 + SortPriority: 4 + CaseSensitive: false +- Regex: ^<.*\.h> + Priority: 3 + SortPriority: 5 + CaseSensitive: false +- Regex: ^<.* + Priority: 3 + SortPriority: 6 + CaseSensitive: false +#IncludeIsMainRegex: '([-_](test|unittest))?$' +#IncludeIsMainSourceRegex: '' +#IndentAccessModifiers: false +IndentCaseLabels: true +#IndentCaseBlocks: false +#IndentGotoLabels: true +IndentPPDirectives: None +IndentExternBlock: AfterExternBlock +#IndentRequires: false +IndentWidth: 4 +#IndentWrappedFunctionNames: false +#InsertTrailingCommas: None +#JavaScriptQuotes: Leave +#JavaScriptWrapImports: true +#KeepEmptyLinesAtTheStartOfBlocks: false +#LambdaBodyIndentation: Signature / OuterScope +MacroBlockBegin: ^BEGIN_FIELD_TABLE|^BEGIN_ENUM_TABLE +MacroBlockEnd: ^END_FIELD_TABLE|^END_ENUM_TABLE +MaxEmptyLinesToKeep: 3 +#NamespaceIndentation: None +#ObjCBinPackProtocolList: Never +#ObjCBlockIndentWidth: 2 +#ObjCBreakBeforeNestedBlockParam: true +#ObjCSpaceAfterProperty: false +#ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 350 +PenaltyBreakBeforeFirstCallParameter: 10 +PenaltyBreakComment: 10 +PenaltyBreakFirstLessLess: 10 +PenaltyBreakString: 100 
+PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 10 +PenaltyReturnTypeOnItsOwnLine: 1000 +PenaltyIndentedWhitespace: 250 +PointerAlignment: Right +#PPIndentWidth: -1 +#RawStringFormats: +# - Language: Cpp +# Delimiters: +# - cc +# - CC +# - cpp +# - Cpp +# - CPP +# - 'c++' +# - 'C++' +# CanonicalDelimiter: '' +# BasedOnStyle: google +# - Language: TextProto +# Delimiters: +# - pb +# - PB +# - proto +# - PROTO +# EnclosingFunctions: +# - EqualsProto +# - EquivToProto +# - PARSE_PARTIAL_TEXT_PROTO +# - PARSE_TEST_PROTO +# - PARSE_TEXT_PROTO +# - ParseTextOrDie +# - ParseTextProtoOrDie +# - ParseTestProto +# - ParsePartialTestProto +# CanonicalDelimiter: pb +# BasedOnStyle: google +#ReferenceAlignment: Pointer +#ReflowComments: true +ShortNamespaceLines: 1 +SortIncludes: CaseInsensitive +#SortJavaStaticImport: Before +#SortUsingDeclarations: true +#SpaceAfterCStyleCast: false +#SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: false +#SpaceBeforeAssignmentOperators: true +#SpaceBeforeCaseColon: false +#SpaceBeforeCpp11BracedList: false +#SpaceBeforeCtorInitializerColon: true +#SpaceBeforeInheritanceColon: true +#SpaceBeforeParens: ControlStatements +#SpaceAroundPointerQualifiers: Default +#SpaceBeforeRangeBasedForLoopColon: true +#SpaceInEmptyBlock: false +#SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +#SpacesInAngles: Never +SpacesInConditionalStatement: true +#SpacesInContainerLiterals: true +#SpacesInCStyleCastParentheses: false +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +#SpacesInParentheses: false +#SpacesInSquareBrackets: false +#SpaceBeforeSquareBrackets: false +BitFieldColonSpacing: None +#Standard: Auto +StatementAttributeLikeMacros: +- DLL_EXPORT +- DLL_IMPORT +StatementMacros: [LOG_PRINT, LOG_WARN, LOG_ERROR, DECLARE_ENUM_AS_FLAGS, REGOR_FIELD_TYPE] +TabWidth: 4 +#UseCRLF: false +#UseTab: Never +WhitespaceSensitiveMacros: +- MACRO_CONCAT diff --git a/.gitignore b/.gitignore index 070ea6c3..55b4c109 100644 
--- a/.gitignore +++ b/.gitignore @@ -3,7 +3,7 @@ build/ ethos_u_vela.egg-info/ ethosu/*.so output/ -.vscode/ .coverage __pycache__ +*.pyc *.pyd diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 472cdf93..3b337df7 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright 2020-2023 Arm Limited and/or its affiliates +# SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates # SPDX-License-Identifier: Apache-2.0 @@ -14,16 +14,50 @@ # See the License for the specific language governing permissions and # limitations under the License. -exclude: '^ethosu/vela/(tflite|ethos_u55_regs|tosa)/' +exclude: ethosu/vela/(tflite|ethos_u55_regs|tosa)|ethosu/regor/dependencies|ethosu/regor/architecture/ethosu[68]5/ethos_u[68]5_interface.hpp|ethosu/regor/(tflite|tosa)/(tflite|tosa)_schema_generated.hpp|[A-Z_0-9]*.md + repos: +- repo: https://github.com/macisamuele/language-formatters-pre-commit-hooks + rev: v2.2.0 + hooks: + - id: pretty-format-yaml + name: Pretty format YAML + description: This hook sets a standard for formatting YAML files. + entry: pretty-format-yaml + language: python + types: [yaml] + minimum_pre_commit_version: '1' + args: [--autofix, --indent=4] + +- repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.0.1 + hooks: + - id: check-toml + - id: end-of-file-fixer + - id: trailing-whitespace + - id: mixed-line-ending + args: [--fix=lf] + description: Forces to replace line ending by the UNIX 'lf' character. 
+ - id: detect-private-key + - id: check-executables-have-shebangs + - id: check-added-large-files + args: [--maxkb=256, --enforce-all] + +- repo: https://github.com/Lucas-C/pre-commit-hooks + rev: v1.5.4 + hooks: + - id: remove-tabs + args: [--whitespaces-count, '4'] + exclude: (M|m)akefile + - repo: https://github.com/pre-commit/mirrors-mypy - rev: 'v0.931' + rev: v0.931 hooks: - id: mypy - args: ["--no-strict-optional", "--show-error-codes", "--ignore-missing-imports"] + args: [--no-strict-optional, --show-error-codes, --ignore-missing-imports] require_serial: true additional_dependencies: [types-setuptools] - minimum_pre_commit_version: '2.9.2' + minimum_pre_commit_version: 2.9.2 - repo: https://github.com/asottile/reorder_python_imports rev: v2.2.0 @@ -42,12 +76,25 @@ repos: hooks: - id: flake8 args: [--max-line-length=120, --extend-ignore=E203] + exclude: ^test/network.py|ethosu/regor/[A-Z_0-9]* - repo: https://github.com/pylint-dev/pylint rev: v2.13.9 hooks: - id: pylint - args: [--score=no, --max-line-length=120, --disable=all, --enable=W0102] + args: [--score=no, --max-line-length=120, --disable=all, --enable=W0102 --ignore=ethosu/regor] + +- repo: https://github.com/pre-commit/mirrors-clang-format + rev: v16.0.6 + hooks: + - id: clang-format + files: .*\.(hpp|cpp|h|c) + types_or: [c++, c] + args: [-style=file, -i] + require_serial: false + additional_dependencies: [] + minimum_pre_commit_version: 2.9.2 + exclude: ethosu/mlw_codec - repo: local hooks: @@ -58,7 +105,7 @@ repos: entry: pytest -s -v types: [python] pass_filenames: false - always_run: true + always_run: false - id: pytest-cov name: pytest-cov @@ -67,4 +114,4 @@ repos: entry: pytest -v --cov=ethosu --cov-fail-under=0 types: [python] pass_filenames: false - always_run: true + always_run: false diff --git a/README.md b/README.md index 95e7ddff..f7293c15 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Vela +**_NOTE:_** **Ethos-U85 support should be considered beta-quality** + This tool is 
used to compile a [TensorFlow Lite for Microcontrollers](https://www.tensorflow.org/lite/microcontrollers) neural network model into an optimised version that can run on an embedded @@ -330,8 +332,10 @@ Additional useful information: * [Arm Products: Ethos-U55 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u55) * [Arm Products: Ethos-U65 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u65) -* [Arm Developer: Ethos-U55 NPU](https://developer.arm.com/ip-products/processors/machine-learning/arm-ethos-u/ethos-u55) -* [Arm Developer: Ethos-U65 NPU](https://developer.arm.com/ip-products/processors/machine-learning/arm-ethos-u/ethos-u65) +* [Arm Products: Ethos-U85 NPU](https://www.arm.com/products/silicon-ip-cpu/ethos/ethos-u85) +* [Arm Developer: Ethos-U55 NPU](https://developer.arm.com/Processors/Ethos-U55) +* [Arm Developer: Ethos-U65 NPU](https://developer.arm.com/Processors/Ethos-U65) +* [Arm Developer: Ethos-U85 NPU](https://developer.arm.com/Processors/Ethos-U85) ## Security diff --git a/TESTING.md b/TESTING.md index 91fe9cc1..0e296e39 100644 --- a/TESTING.md +++ b/TESTING.md @@ -1,5 +1,5 @@ +# Third Party Software + +The following lists the Third Party software versions and licenses used by Vela: +* FlatBuffers [v23.5.26](https://github.com/google/flatbuffers/releases/tag/v23.5.26) - +([Apache-2.0 License](https://github.com/google/flatbuffers/blob/v23.5.26/LICENSE)) + +* fmt [10.2.1](https://github.com/fmtlib/fmt/releases/tag/10.2.1) - +([MIT License](https://github.com/fmtlib/fmt/blob/10.2.1/LICENSE)) + +* Catch2 [v3.5.3](https://github.com/catchorg/Catch2/releases/tag/v3.5.3) - +([BSL-1.0 License](https://github.com/catchorg/Catch2/blob/v3.5.3/LICENSE.txt)) + +* Gemmlowp [09d81e02ab15b41405caebeb5eb63fd12555aee3](https://github.com/google/gemmlowp/tree/09d81e02ab15b41405caebeb5eb63fd12555aee3) - +([Apache-2.0 License](https://github.com/google/gemmlowp/blob/09d81e02ab15b41405caebeb5eb63fd12555aee3/LICENSE)) + +* pybind11 
[v2.11.1](https://github.com/pybind/pybind11/releases/tag/v2.11.1) - +([BSD-3-Clause License](https://github.com/pybind/pybind11/blob/v2.11.1/LICENSE)) \ No newline at end of file diff --git a/ethosu/config_files/Arm/vela.ini b/ethosu/config_files/Arm/vela.ini index 50ad055f..12170c90 100644 --- a/ethosu/config_files/Arm/vela.ini +++ b/ethosu/config_files/Arm/vela.ini @@ -1,4 +1,4 @@ -; SPDX-FileCopyrightText: Copyright 2020-2021 Arm Limited and/or its affiliates +; SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates ; ; SPDX-License-Identifier: Apache-2.0 ; @@ -104,6 +104,86 @@ Dram_burst_length=128 Dram_read_latency=500 Dram_write_latency=250 +; SRAMx2 (6.4 GB/s) and Flash (0.2 GB/s) +[System_Config.Ethos_U85_SYS_Flash_Low] +core_clock=200e6 +axi0_port=Sram +axi1_port=OffChipFlash +Sram_clock_scale=1 +Sram_ports_used=2 +Sram_burst_length=64 +Sram_read_latency=8 +Sram_write_latency=8 +OffChipFlash_clock_scale=0.0625 +OffChipFlash_ports_used=1 +OffChipFlash_burst_length=128 +OffChipFlash_read_latency=32 +OffChipFlash_write_latency=32 + +; SRAMx2 (16 GB/s) and Flash (0.5 GB/s) +[System_Config.Ethos_U85_SYS_Flash_High] +core_clock=500e6 +axi0_port=Sram +axi1_port=OffChipFlash +Sram_clock_scale=1.0 +Sram_ports_used=2 +Sram_burst_length=64 +Sram_read_latency=32 +Sram_write_latency=32 +OffChipFlash_clock_scale=0.0625 +OffChipFlash_ports_used=1 +OffChipFlash_burst_length=128 +OffChipFlash_read_latency=64 +OffChipFlash_write_latency=64 + +; SRAMx2 (16 GB/s) and DRAMx1 (3.75 GB/s) +[System_Config.Ethos_U85_SYS_DRAM_Low] +core_clock=500e6 +axi0_port=Sram +axi1_port=Dram +Sram_clock_scale=1.0 +Sram_ports_used=2 +Sram_burst_length=64 +Sram_read_latency=16 +Sram_write_latency=16 +Dram_clock_scale=0.46875 +Dram_ports_used=1 +Dram_burst_length=128 +Dram_read_latency=500 +Dram_write_latency=250 + +; SRAMx2 (32 GB/s) and DRAM (12 GB/s) +[System_Config.Ethos_U85_SYS_DRAM_Mid] +core_clock=1e9 +axi0_port=Sram +axi1_port=Dram +Sram_clock_scale=1.0 
+Sram_ports_used=2 +Sram_burst_length=64 +Sram_read_latency=32 +Sram_write_latency=32 +Dram_clock_scale=0.75 +Dram_ports_used=1 +Dram_burst_length=128 +Dram_read_latency=500 +Dram_write_latency=250 + +; SRAMx4 (64 GB/s) and DRAMx2 (24 GB/s) +[System_Config.Ethos_U85_SYS_DRAM_High] +core_clock=1e9 +axi0_port=Sram +axi1_port=Dram +Sram_clock_scale=1.0 +Sram_ports_used=4 +Sram_burst_length=64 +Sram_read_latency=32 +Sram_write_latency=32 +Dram_clock_scale=0.75 +Dram_ports_used=2 +Dram_burst_length=128 +Dram_read_latency=500 +Dram_write_latency=250 + ; ----------------------------------------------------------------------------- ; Memory Mode @@ -120,12 +200,23 @@ const_mem_area=Axi1 arena_mem_area=Axi0 cache_mem_area=Axi0 -; Dedicated SRAM: the SRAM (384KB) is only for use by the Ethos-U +; Dedicated SRAM: the SRAM is only for use by the Ethos-U ; The non-SRAM memory is assumed to be read-writeable [Memory_Mode.Dedicated_Sram] const_mem_area=Axi1 arena_mem_area=Axi1 cache_mem_area=Axi0 + +; Dedicated SRAM 256KB: the SRAM (256KB) is only for use by the Ethos-U +; The non-SRAM memory is assumed to be read-writeable +[Memory_Mode.Dedicated_Sram_256KB] +inherit=Memory_Mode.Dedicated_Sram +arena_cache_size=262144 + +; Dedicated SRAM 384KB: the SRAM (384KB) is only for use by the Ethos-U +; The non-SRAM memory is assumed to be read-writeable +[Memory_Mode.Dedicated_Sram_384KB] +inherit=Memory_Mode.Dedicated_Sram arena_cache_size=393216 ; Dedicated SRAM 512KB: the SRAM (512KB) is only for use by the Ethos-U diff --git a/ethosu/mlw_codec/mlw_decode.c b/ethosu/mlw_codec/mlw_decode.c index 264f3a6a..214b0317 100644 --- a/ethosu/mlw_codec/mlw_decode.c +++ b/ethosu/mlw_codec/mlw_decode.c @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright 2020, 2022 Arm Limited and/or its affiliates + * SPDX-FileCopyrightText: Copyright 2020, 2022-2023 Arm Limited and/or its affiliates * * SPDX-License-Identifier: Apache-2.0 * @@ -91,7 +91,6 @@ int mlw_decode( uint8_t *inbuf, int inbuf_size, 
int16_t **outbuf, int verbose) { int first=1; int use_zero_run, i, j; int outbuf_size=0; - int nchunks=0; *outbuf=0; @@ -260,7 +259,6 @@ int mlw_decode( uint8_t *inbuf, int inbuf_size, int16_t **outbuf, int verbose) { z_prev_enable = z_enable; z_prev_nsymbols = z_nsymbols; memcpy( z_prev_q, z_q, sizeof(z_prev_q)); - nchunks++; } while( w_prev_enable || z_prev_enable ); // Interleave non-zero and zeros into the outbut buffer diff --git a/ethosu/regor/CMakeLists.txt b/ethosu/regor/CMakeLists.txt new file mode 100644 index 00000000..f7615ff2 --- /dev/null +++ b/ethosu/regor/CMakeLists.txt @@ -0,0 +1,355 @@ +# +# SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +cmake_minimum_required(VERSION 3.20.2) + +############################################################################# +# Policies +############################################################################# + +macro(regor_policy num val) + string(LENGTH "${num}" policy_var) + math(EXPR policy_var "4 - ${policy_var}") + string(REPEAT "0" ${policy_var} policy_var) + set(policy_var "CMP${policy_var}${num}") + if (POLICY ${policy_var}) + cmake_policy(SET ${policy_var} ${val}) + set(CMAKE_POLICY_DEFAULT_${policy_var} ${val}) + endif() + unset(policy_var) +endmacro() + +regor_policy(63 NEW) +regor_policy(69 NEW) +regor_policy(91 NEW) +regor_policy(92 NEW) +regor_policy(94 NEW) + +############################################################################# +# Project +############################################################################# + +if ("${CMAKE_CURRENT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}") + project(regor + VERSION 0.1.0 + DESCRIPTION "Regor Ethos-U compiler" + LANGUAGES CXX) + + set_property(GLOBAL PROPERTY USE_FOLDERS ON) + + if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}") + message(FATAL_ERROR "${CMAKE_PROJECT_NAME} requires an out of source build.") + endif() +endif() + +############################################################################# +# Folders +############################################################################# + +include(GNUInstallDirs) +# Set libdir in a consistent way +math(EXPR arch_bits "${CMAKE_SIZEOF_VOID_P}*8") +if (arch_bits EQUAL 64) + set(CMAKE_INSTALL_LIBDIR "lib64" CACHE PATH "Libdir path" FORCE) +endif() +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) +if (CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) + set(CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/install" CACHE PATH "Installation path" FORCE) +endif() + 
+############################################################################# +# Config +############################################################################# + +set(DEFAULT_CMAKE_BUILD_TYPE "Debug") +if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + message(STATUS "No build type selected, default to ${DEFAULT_CMAKE_BUILD_TYPE}") + set(CMAKE_BUILD_TYPE "${DEFAULT_CMAKE_BUILD_TYPE}" CACHE STRING "Build type (default ${DEFAULT_CMAKE_BUILD_TYPE})" FORCE) +endif() + +if (CMAKE_TOOLCHAIN_FILE) + message(STATUS "Toolchain file: ${CMAKE_TOOLCHAIN_FILE}") +endif() + +############################################################################# +# Options +############################################################################# + +# CCACHE support +find_program(CCACHE_PROGRAM ccache) +if (CCACHE_PROGRAM) + message(STATUS "Looking for CCACHE support - Success") + set(ENABLE_CCACHE_DEFAULT ON) +else() + message(STATUS "Looking for CCACHE support - Not found") + set(ENABLE_CCACHE_DEFAULT OFF) +endif() + +# LTO/IPO support. 
Only enable in Release mode +include(CheckIPOSupported) +check_ipo_supported(RESULT ENABLE_LTO_DEFAULT) +if (ENABLE_LTO_DEFAULT) + if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + set(ENABLE_LTO_DEFAULT OFF) + message(STATUS "Looking for IPO support - Success (default disabled)") + else() + message(STATUS "Looking for IPO support - Success") + endif() +else() + message(STATUS "Looking for IPO support - Not found") +endif() + +# Gold linker +find_program(LD_GOLD "ld.gold") +if (LD_GOLD) + message(STATUS "Looking for ld.gold support - Success (default disabled)") + set(ENABLE_LDGOLD_DEFAULT OFF) +else() + message(STATUS "Looking for ld.gold support - Not found") + set(ENABLE_LDGOLD_DEFAULT OFF) +endif() + +# Werror +if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + set(ENABLE_WERROR_DEFAULT OFF) +else() + set(ENABLE_WERROR_DEFAULT ON) +endif() + +# Asserts +if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + set(ENABLE_ASSERT_DEFAULT ON) +else() + set(ENABLE_ASSERT_DEFAULT OFF) +endif() + +option(REGOR_ENABLE_LTO "Link Time Optimization" ${ENABLE_LTO_DEFAULT}) +option(REGOR_ENABLE_LDGOLD "Enable Gold linker if available" ${ENABLE_LDGOLD_DEFAULT}) +option(REGOR_ENABLE_CCACHE "Enable ccache if available" ${ENABLE_CCACHE_DEFAULT}) +option(REGOR_ENABLE_WERROR "Warnings as errors" ${ENABLE_WERROR_DEFAULT}) +option(REGOR_ENABLE_STD_STATIC "Link libstdc and libgcc statically" OFF) +option(REGOR_ENABLE_COVERAGE "Enable coverage build" OFF) +option(REGOR_ENABLE_PROFILING "Enable timer based runtime profiling" OFF) +option(REGOR_ENABLE_ASSERT "Enable asserts" ${ENABLE_ASSERT_DEFAULT}) +option(REGOR_ENABLE_EXPENSIVE_CHECKS "Enable expensive STL GLICXX asserts" OFF) +option(REGOR_ENABLE_RTTI "Enable RTTI" OFF) +option(REGOR_ENABLE_VALGRIND "Enable valgrind during check target" OFF) +option(REGOR_ENABLE_TESTING "Enable unit testing" ON) +option(REGOR_ENABLE_CPPCHECK "Enable cppcheck" OFF) +set(REGOR_SANITIZE "" CACHE STRING "Sanitizer setting. 
For example undefined") +set(REGOR_LOG_TRACE_MASK "" CACHE STRING "Log trace enable mask") +set(REGOR_PACKAGE_NAME "${PROJECT_NAME}" CACHE STRING "CPack package name. Will be suffixed with platform tag") +set(REGOR_DEBUG_COMPRESSION "zlib-gnu" CACHE STRING "Debug symbol compression. none, zlib or zlib-gnu") +set(REGOR_PYTHON_BINDINGS_DESTINATION "" CACHE STRING "Python bindings install destination") +set(REGOR_PYEXT_VERSION "${${PROJECT_NAME}_VERSION}" CACHE STRING "Python extension version") + +############################################################################# +# General purpose +############################################################################# + +# Modules +list(APPEND CMAKE_MODULE_PATH + ${CMAKE_CURRENT_LIST_DIR}/cmake/ + ${CMAKE_CURRENT_LIST_DIR}/dependencies/thirdparty/Catch2 +) + +include(utils) + +# Python +utils_find_python() + +# Export compile commands and flags +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +# Action CCache +if (REGOR_ENABLE_CCACHE) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CCACHE_PROGRAM}") +endif() + +# Action CPPCHECK +if (REGOR_ENABLE_CPPCHECK) + find_program(CPPCHECK cppcheck) + if (CPPCHECK) + message(STATUS "Looking for cppcheck support - Success") + set(CMAKE_CXX_CPPCHECK ${CMAKE_COMMAND} -E env CMAKE_BINARY_DIR=${CMAKE_BINARY_DIR} + ${Python3_EXECUTABLE} ${CMAKE_CURRENT_LIST_DIR}/tools/cppcheck.py + ${CPPCHECK} --enable=all --inconclusive --quiet + --suppress=missingIncludeSystem + --suppress=*:${CMAKE_CURRENT_LIST_DIR}/dependencies/* + ) + else() + message(STATUS "Looking for cppcheck support - Not found") + endif() +endif() + +############################################################################# +# Modules +############################################################################# + +include(regor_options) +include(regor_lib) +include(regor_dependencies) + +############################################################################# +# Top level targets 
+############################################################################ + +find_package(Threads REQUIRED) + +if (REGOR_PYTHON_BINDINGS_DESTINATION AND NOT TARGET pybind11::module) + regor_add_dependency(dependencies/thirdparty/pybind11) +endif() + +set(REGOR_LIB_DEPS + Threads::Threads + regor::flatbuffers + regor::fmt + regor::gemmlowp + mlw_codec_st +) + +set(REGOR_HEADERS + "include/regor.h" + "include/regor_database.hpp" + "include/regor_interface.hpp" +) + +regor_lib( + NAME + regor-objects + TYPE OBJECT + DEFINES + "$<$:LOG_TRACE_ENABLE=${REGOR_LOG_TRACE_MASK}>" + DEPS + ${REGOR_LIB_DEPS} + PUBLIC_HEADERS + ${REGOR_HEADERS} + INC_DIRS + ${CMAKE_CURRENT_SOURCE_DIR} + SOURCES + "regor.cpp" + "common/common.cpp" + "common/data_type.cpp" + "common/logging.cpp" + "common/scaling.cpp" + "common/transpose_type.cpp" + "common/reverse_type.cpp" + "architecture/architecture.cpp" + "architecture/ethos_u_scaling.cpp" + "architecture/mlw_encode.cpp" + "architecture/ethosu55/ethos_u55.cpp" + "architecture/ethosu55/ethos_u55_scaling.cpp" + "architecture/ethosu55/ethos_u55_weight_encoder.cpp" + "architecture/ethosu55/ethos_u55_performance.cpp" + "architecture/ethosu55/ethos_u55_register_cs_generator.cpp" + "architecture/ethosu65/ethos_u65.cpp" + "architecture/ethosu65/ethos_u65_register_cs_generator.cpp" + "architecture/ethosu85/ethos_u85.cpp" + "architecture/ethosu85/ethos_u85_register_cs_generator.cpp" + "architecture/ethosu85/ethos_u85_scaling.cpp" + "architecture/ethosu85/ethos_u85_weight_encoder.cpp" + "architecture/ethosu85/ethos_u85_performance.cpp" + "compiler/attributes.cpp" + "compiler/compiler.cpp" + "compiler/faststorage_allocator.cpp" + "compiler/graph_builder.cpp" + "compiler/graph_packing.cpp" + "compiler/high_level_command_stream_generator.cpp" + "compiler/hillclimb_allocator.cpp" + "compiler/live_range.cpp" + "compiler/network_performance.cpp" + "compiler/operation.cpp" + "compiler/quantization.cpp" + "compiler/raw_writer.cpp" + 
"compiler/scheduler.cpp" + "compiler/scheduler_decompose.cpp" + "compiler/scheduler_packing.cpp" + "compiler/softmax.cpp" + "compiler/tensor.cpp" + "compiler/tensor_allocator.cpp" + "compiler/tflite_graph_optimiser.cpp" + "compiler/tflite_graph_optimiser_tp.cpp" + "compiler/cascade_builder.cpp" + "compiler/graph_optimiser.cpp" + "compiler/graphir_optimiser.cpp" + "compiler/optimiser_utils.cpp" + "compiler/graph_validator.cpp" + "compiler/tosa_graph_validator.cpp" + "compiler/op_type.cpp" + "tosa/tosa_validator.cpp" + "tosa/tosa_argument_checks.cpp" + "tosa/tosa_error_checks.cpp" + "tosa/tosa_level_checks.cpp" + "tosa/tosa_require_checks.cpp" + "tosa/tosa_validator_version_0_60_0_profile_bi.cpp" + "tosa/tosa_reader.cpp" + "tosa/tosa_mapping.cpp" + "tflite/tflite_reader.cpp" + "tflite/tflite_writer.cpp" + "tflite/tflite_mapping.cpp" + "tflite/tflite_model_semantics.cpp" +) + +regor_lib( + NAME + regor-static + OUTPUT_NAME + regor + COMPONENT regor + TYPE STATIC + DEPS + regor-objects + PUBLIC_HEADERS + ${REGOR_HEADERS} +) + +if (REGOR_PYTHON_BINDINGS_DESTINATION) + regor_lib( + NAME + PyRegor + OUTPUT_NAME + regor + COMPONENT python-bindings + INSTALL_LOCATION + ${REGOR_PYTHON_BINDINGS_DESTINATION} + TYPE PY_MODULE + DEFINES + REGOR_VERSION="${REGOR_PYEXT_VERSION}" + SOURCES + "bindings/python/py_regor.cpp" + DEPS + regor-objects + COPTS + "$,/GR,-frtti>" + ) +endif() + +############################################################################# +# Subdirs +############################################################################ + +if (REGOR_ENABLE_TESTING) + # Enable testing at the top level to get a top level check target + include(CTest) + add_subdirectory(test) +endif() + +# CPack last +include(cpack_config) diff --git a/ethosu/regor/architecture/architecture.cpp b/ethosu/regor/architecture/architecture.cpp new file mode 100644 index 00000000..1f90fc64 --- /dev/null +++ b/ethosu/regor/architecture/architecture.cpp @@ -0,0 +1,221 @@ +// +// 
SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "architecture.hpp" + +#include "common/logging.hpp" + +#include "common/bit_flags.hpp" + +BEGIN_ENUM_TABLE(regor::MemUsage) + ADD_ENUM_NAME(None) + ADD_ENUM_NAME(ReadOnly) + ADD_ENUM_NAME(FeatureMap) + ADD_ENUM_NAME(LUT) + ADD_ENUM_NAME(Staging) +END_ENUM_TABLE() + +namespace regor +{ + +IniParseResult Architecture::ParseSection(const std::string §ion, IniReader *reader) +{ + // Parse the architecture config (must happen first in INI file ). 
+ if ( section == "architecture" ) + { + if ( !ParseConfig(reader) ) + { + return IniParseResult::Error; + } + } + // Parse memory definitions + else if ( section == "memory" || section.find("memory.") == 0 ) + { + if ( !ParseMemory(reader, section) ) + { + return IniParseResult::Error; + } + } + // Parse system configuration locally + else if ( section == "system" ) + { + std::string key; + std::string tmp; + while ( reader->Begin(key) ) + { + if ( key == "const" ) + { + tmp = reader->Get(); + SetReadonlyMemory(tmp); + } + else if ( key == "feature_maps" ) + { + tmp = reader->Get(); + SetFeatureMapMemory(tmp); + } + else if ( key == "staging" ) + { + tmp = reader->Get(); + SetStagingMemory(tmp); + } + else + { + LOG_WARN("Skipping parsing of unrecognised configuration option '{}' of section '{}'\n", key, section); + } + reader->End(); + } + } + else + { + LOG_WARN("Skipping parsing of unrecognised configuration section '{}'\n", section); + return IniParseResult::Unknown; + } + + return IniParseResult::Done; +} + + +bool Architecture::ParseMemory(IniReader *reader, const std::string §ion) +{ + std::string name; + Address size = MaxAddress(); + float bandwidth = 1; + int readLatency = 0; + int writeLatency = 0; + int burstLength = 1; + int portIndex = 0; + + // Parse memory definition + std::string key; + while ( reader->Begin(key) ) + { + if ( key == "name" ) + { + name = reader->Get(); + } + else if ( key == "size" ) + { + size = reader->Get(); + std::string suffix; + if ( reader->Read(suffix) ) + { + if ( suffix == "kb" ) + { + size *= 1024; + } + else if ( suffix == "mb" ) + { + size *= 1024 * 1024; + } + } + } + else if ( key == "bandwidth" ) + { + bandwidth = std::max(0.0001f, reader->Get()); + } + else if ( key == "read_latency" ) + { + readLatency = std::max(0, reader->Get()); + } + else if ( key == "write_latency" ) + { + writeLatency = std::max(0, reader->Get()); + } + else if ( key == "burst_length" ) + { + burstLength = std::max(1, reader->Get()); + } + 
else if ( key == "port_index" ) + { + portIndex = std::max(0, reader->Get()); + } + else + { + LOG_WARN("Skipping parsing of unrecognised memory configuration option '{}'\n", key); + } + + reader->End(); + } + + // Add a named, sized, memory to the system memory map + if ( name.empty() ) + { + LOG_ERROR("Unable to parse memory configuration. All memories must have a name.\n"); + return false; + } + if ( (std::string("memory.") + name) != section ) + { + LOG_ERROR("Unable to parse memory configuration. All memories must have matching name key and section name.\n"); + return false; + } + else if ( _memories.count(name) ) + { + LOG_ERROR("Unable to parse memory configuration for '{}'. All memories must have a unique name.\n", name); + return false; + } + else if ( size <= 0 ) + { + LOG_ERROR("Unable to parse memory configuration for '{}' of size {} bytes. All memories must have size > 0.\n", name, size); + return false; + } + else + { + auto memory = std::make_unique(name, size); + memory->SetParameters(bandwidth, readLatency, writeLatency, burstLength, portIndex); + _memories[name] = std::move(memory); + } + + return true; +} + + +bool Architecture::CheckConfiguration(std::string &error) +{ + if ( !_featuremapMemory ) + { + error = "Feature Map memory not configured"; + return false; + } + if ( !_lutMemory ) + { + error = "LUT memory not configured"; + return false; + } + if ( !_stagingMemory ) + { + error = "Staging memory not configured"; + return false; + } + if ( !_readonlyMemory ) + { + error = "Readonly memory not configured"; + return false; + } + for ( auto &mem : _memories ) + { + if ( mem.second->SizeBytes() > MaxAddress() ) + { + error = "Configured memory size out of bounds for memory: " + mem.first; + return false; + } + } + + return true; +} + +} // namespace regor diff --git a/ethosu/regor/architecture/architecture.hpp b/ethosu/regor/architecture/architecture.hpp new file mode 100644 index 00000000..56bac80c --- /dev/null +++ 
b/ethosu/regor/architecture/architecture.hpp @@ -0,0 +1,391 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "common/bit_flags.hpp" +#include "common/data_type.hpp" +#include "common/ini_reader.hpp" +#include "common/numeric_util.hpp" +#include "common/reverse_type.hpp" +#include "common/scaling.hpp" +#include "common/shape.hpp" +#include "common/transpose_type.hpp" +#include "compiler/kernel.hpp" +#include "compiler/op_type.hpp" +#include "compiler/tensor_properties.hpp" +#include "mlw_encode.hpp" + +#include +#include +#include +namespace regor +{ + +class WeightEncoder; +class IRegisterCommandStreamGenerator; + +using Address = int64_t; + +enum class TensorFormat : int16_t +{ + Unknown = 0, + NHWC = 1, + NHCWB16 = 2, + WeightsEncoded = 3, +}; + +/// +/// Architecture connected-memory definition +/// +struct ArchitectureMemory +{ +protected: + std::string _name; + Address _sizeBytes = 0; + float _bandwidthPerCycle = 1; + int _readLatencyCycles = 0; + int _writeLatencyCycles = 0; + int _maxBurstLengthBytes = 0; + int _portIndex = 0; + +public: + ArchitectureMemory(const std::string &name, Address sizeBytes) : _name(name), _sizeBytes(sizeBytes) {} + +public: + void SetParameters(float bandwidth, int readLatency, int writeLatency, int maxBurstLengthBytes, int portIndex) + { + 
_bandwidthPerCycle = bandwidth; + _readLatencyCycles = readLatency; + _writeLatencyCycles = writeLatency; + _maxBurstLengthBytes = maxBurstLengthBytes; + _portIndex = portIndex; + } + + float Bandwidth() const { return _bandwidthPerCycle; } // Assume read/write symmetrical + int ReadLatency() const { return _readLatencyCycles; } + int WriteLatency() const { return _writeLatencyCycles; } + Address SizeBytes() const { return _sizeBytes; } + int MaxBurstLength() const { return _maxBurstLengthBytes; } + std::string Name() const { return _name; } +}; + +enum class MemUsage : uint16_t +{ + None = 0, + ReadOnly = 0x1, + FeatureMap = 0x2, + LUT = 0x4, + Staging = 0x8, +}; + +struct MemArea +{ + ArchitectureMemory *memory = nullptr; // The physical memory area + Flags usage; // Usage partition within the memory area + + MemArea() = default; + MemArea(ArchitectureMemory *architectureMemory, MemUsage memUsage) : memory(architectureMemory), usage(memUsage) {} + + bool operator==(const MemArea &other) const { return memory == other.memory && usage == other.usage; } + bool operator!=(const MemArea &other) const { return !operator==(other); } + explicit operator bool() const { return (memory != nullptr) && (usage != MemUsage::None); } + + struct hash + { + size_t operator()(const MemArea &memArea) const { return size_t(memArea.memory) | size_t(memArea.usage); } + }; +}; + +/// +/// Per-operator architecture configuration base +/// +class ArchitectureOpConfig +{ +public: + virtual ~ArchitectureOpConfig() = default; + virtual std::unique_ptr Clone() = 0; + virtual int MaxIFMBuffering() = 0; + virtual Point2i OptimalStripeGranule() = 0; + virtual int OptimalDepthGranule() = 0; + virtual std::string ToString(bool full) = 0; +}; + +enum class ArchResampling : uint8_t +{ + None = 0, + Nearest = 1, + Zeros = 2, +}; + +enum class ArchResizeMode : uint8_t +{ + Nearest = 0, + Bilinear = 1, + Replicate = 2, +}; + +/// +/// Description of a candidate op to add to a ArchitectureOpGroup +/// 
/// <summary>
/// Description of a candidate op to add to a ArchitectureOpGroup
/// </summary>
struct ArchitectureOpGroupQuery
{
    // Identity and element type of one tensor connection of the candidate op.
    struct TensorInfo
    {
        UniqueId key;
        DataType type;
    };

    OpType type;
    const Kernel *kernel;
    TensorInfo ifm;   // Primary input
    TensorInfo ifm2;  // Secondary input (binary elementwise); unused otherwise
    TensorInfo ofm;   // Output
};

/// <summary>
/// Group of ops that can be fused and/or chained
/// </summary>
class ArchitectureOpGroup
{
public:
    virtual ~ArchitectureOpGroup() = default;
    // Try to add `op` to the group; `dependsOn` lists keys of ops already in
    // the group that this op consumes (element type reconstructed - TODO confirm).
    virtual int Add(const ArchitectureOpGroupQuery &op, const std::vector<int> &dependsOn = {}) = 0;

protected:
    // Whether this op can execute on the NPU at all (architecture-specific).
    virtual bool CanRunOnNPU(const ArchitectureOpGroupQuery &op) = 0;
};

// Where the accumulator register file is initialised from.
enum class ArchAccumulatorSource : uint8_t
{
    Reset = 0,  // Start from zero (normal case)
    Acc = 1,    // Keep the previous accumulator contents
    Ifm2 = 2    // Preload from the secondary input
};

/// <summary>
/// Query Information to retrieve HW specific operator config
/// </summary>
struct ArchitectureConfigQuery
{
    Shape ofmShape;
    Shape ifmShape[2];
    int ifmBits;               // Element width of the IFM in bits
    Kernel *kernel;
    int lutBytes;              // LUT storage required (0 if no LUT)
    bool scaled;               // Whether OFM scaling is applied
    ArchResampling ifmResampling;
    TransposeType transpose;
    ReverseType reverse;
    TensorFormat ofmFormat;
    WeightFormat weightFormat;
    ArchAccumulatorSource accSource = ArchAccumulatorSource::Reset;
    bool accOutputEnabled = true;
    // Fractional resize scaling in each axis (identity by default).
    struct Rescale
    {
        GraphApi::FractionND scaleY = {1, 1};
        GraphApi::FractionND scaleX = {1, 1};
    } rescaling;
};

/// <summary>
/// Information for querying operation performance
/// </summary>
struct PerformanceQuery
{
    OpType type;
    Kernel *kernel;
    ArchitectureOpConfig *config;       // Block config chosen for this op
    Shape ifmShape[2];
    ArchitectureMemory *ifmMemory[2];
    DataType ifmType[2];
    TensorFormat ifmFormat[2];
    Shape ofmShape;
    ArchitectureMemory *ofmMemory;
    DataType ofmType;
    TensorFormat ofmFormat;
    Shape constShape;                   // Weights/bias shape
    ArchitectureMemory *constMemory;
};

/// <summary>
/// Information for querying performance for HW fused operations
/// </summary>
struct FusionQuery
{
    OpType type;
    Kernel *kernel = nullptr;
    Shape ifm2Shape;
    ArchitectureMemory *ifm2Memory = nullptr;
    DataType ifm2Type;
    TensorFormat ifm2Format;
};

/// <summary>
/// Information for querying support for Resize
/// </summary>
struct ResizeSupportQuery
{
    ArchResizeMode mode;
    GraphApi::FractionND scaleY;
    GraphApi::FractionND scaleX;
    int offsetY;
    int offsetX;
    Shape ifmShape;
};

/// <summary>
/// Information for querying whether an operation can be executed by the architecture
/// </summary>
struct ExecutionQuery
{
    DataType ifmType[2];
    DataType ofmType;
    const Kernel *kernel;
};

/// <summary>
/// Cycle cost of performing an operation
/// </summary>
struct CycleCost
{
    int64_t opCycles = 0;  // Estimated total NPU cycles
    int64_t macs = 0;      // Multiply-accumulate operations performed
};

/// <summary>
/// How elements are accessed during an operation
/// </summary>
struct ElementAccess
{
    int ifmRead[2] = {0, 0};  // Elements read per IFM
    int ofmWrite = 0;         // Elements written to the OFM
    int weightsRefetch = 0;   // Number of times the weights are re-fetched
    int constRead[2] = {0, 0};
};

/// <summary>
/// Architecture performance interface
/// </summary>
class ArchitecturePerformance
{
public:
    virtual ~ArchitecturePerformance() = default;
    // Estimate cycles for `query` with any `fused` follow-on ops included.
    virtual CycleCost MeasureCycleCost(const PerformanceQuery &query, const std::vector<FusionQuery> &fused) = 0;
    // Estimate cycles for a DMA transfer of `sizeBytes` between two memories.
    virtual int64_t MemToMemCycles(const ArchitectureMemory *dest, const ArchitectureMemory *source, int sizeBytes) = 0;
    virtual ElementAccess MeasureElementAccess(const PerformanceQuery &query) = 0;
    // Convert an element-count access record into a byte-count record.
    virtual ElementAccess ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) = 0;
};

// Outcome of parsing one ini-file section.
enum class IniParseResult
{
    Unknown = 0,  // Section not recognised (skipped)
    Done,
    Error,
};

// Axes along which an operation may be subdivided for striping.
enum class AxisMask
{
    None = 0,
    AxisY = 1,
};

/// <summary>
/// ArchitectureFeatures base
/// </summary>
class Architecture
{
protected:
    // All memories parsed from [memory.<name>] sections, keyed by name.
    std::unordered_map<std::string, std::unique_ptr<ArchitectureMemory>> _memories;
    // Role pointers below reference entries owned by _memories (or by the
    // derived architecture, e.g. on-chip SHRAM for the LUT).
    ArchitectureMemory *_readonlyMemory = nullptr;
    ArchitectureMemory *_featuremapMemory = nullptr;
    ArchitectureMemory *_lutMemory = nullptr;
    ArchitectureMemory *_stagingMemory = nullptr;

public:
    virtual ~Architecture() = default;
    virtual bool ParseConfig(IniReader *reader) = 0;
    // Validate that all memory roles are bound and sizes are in range.
    virtual bool CheckConfiguration(std::string &error);
    virtual std::unique_ptr<ArchitectureOpConfig> GetOpConfig(OpType opType, const ArchitectureConfigQuery &query) = 0;
    virtual std::unique_ptr<ArchitectureOpGroup> CreateOpGroup(const ArchitectureOpGroupQuery &op) = 0;
    virtual class WeightEncoder *WeightEncoder() = 0;
    virtual ArchitecturePerformance *Performance() = 0;
    virtual IRegisterCommandStreamGenerator *RegisterCommandStreamGenerator() = 0;
    virtual TensorFormat IdealBufferingFormat() { return TensorFormat::Unknown; }
    virtual Address MaxAddress() = 0;
    virtual std::vector<uint32_t> ConfigRegisters() = 0;
    virtual uint32_t Version() = 0;
    // Returns the upscale factor for `resampling` and sets `rounding`.
    virtual int UpscaleAndRounding(ArchResampling resampling, int &rounding) = 0;
    virtual AxisMask CanSubdivide(OpType opType) = 0;
    virtual bool SupportsLeakyRelu(bool quantized, DataType type) = 0;
    virtual bool SupportsMatMul(OpType opType) = 0;
    virtual bool SupportsTranspose(OpType opType, TransposeType transposeType) = 0;
    virtual bool SupportsReverse(OpType opType, ReverseType reverseType) = 0;
    virtual bool SupportsGather(OpType opType) = 0;
    virtual bool SupportsScatter(OpType opType) = 0;
    virtual bool SupportsSigmoidTanhLutInt16(OpType opType) = 0;
    virtual bool SupportsResize(const ResizeSupportQuery &query) = 0;
    virtual bool SupportsAccumulatorMode(ArchAccumulatorSource source, bool outputEnabled) = 0;
    virtual bool SupportsScalar(OpType opType, DataType dataType, TensorUsage usage) = 0;
    virtual bool SupportsArgMax(OpType opType) = 0;
    virtual Flags<WeightFormat> SupportedWeightFormat(OpType op) = 0;

    MemArea ReadonlyMemory()
    {
        assert(_readonlyMemory);
        return MemArea(_readonlyMemory, MemUsage::ReadOnly);
    }
    // When feature-map and staging memory are the same physical memory the
    // returned area carries both usage flags, so allocators share the space.
    MemArea FeatureMapMemory()
    {
        assert(_featuremapMemory);
        Flags<MemUsage> usage = MemUsage::FeatureMap;
        if ( _featuremapMemory == _stagingMemory )
        {
            usage |= MemUsage::Staging;
        }
        return MemArea(_featuremapMemory, usage);
    }
    MemArea LUTMemory()
    {
        assert(_lutMemory);
        return MemArea(_lutMemory, MemUsage::LUT);
    }
    MemArea StagingMemory()
    {
        assert(_stagingMemory);
        Flags<MemUsage> usage = MemUsage::Staging;
        if ( _featuremapMemory == _stagingMemory )
        {
            usage |= MemUsage::FeatureMap;
        }
        return MemArea(_stagingMemory, usage);
    }

    // Parse one ini section ("architecture", "memory.*" or "system").
    IniParseResult ParseSection(const std::string &section, IniReader *reader);

    // Select named memories (name must exist in _memories; throws otherwise).
    void SetReadonlyMemory(const std::string &name) { _readonlyMemory = _memories.at(name).get(); }
    void SetFeatureMapMemory(const std::string &name) { _featuremapMemory = _memories.at(name).get(); }
    void SetStagingMemory(const std::string &name) { _stagingMemory = _memories.at(name).get(); }

private:
    // Parse a single [memory.<name>] definition into _memories.
    bool ParseMemory(IniReader *reader, const std::string &section);
};

} // namespace regor
+// + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "register_command_stream_generator.hpp" + +namespace regor +{ + +template +class EthosURegisterCSGenerator : public IRegisterCommandStreamGenerator +{ +public: + void PrintCommandStream(const std::vector &stream, std::vector> &debugInfo) override + { + LOG_PRINT("Register command stream: {} words\n", stream.size()); + LOG_PRINT("{0:>8}: {1:8}{2:4} {3:4} - {4:30} {5:5}, {6}\n", " Offset", "Payload", "Param", "Code", "Command", "Param", "Fields"); + + size_t debugInfoIdx = 0; + for ( unsigned i = 0; i < stream.size(); ) + { + if ( debugInfoIdx < debugInfo.size() && debugInfo[debugInfoIdx].first == i ) + { + LOG_PRINT("// {}\n", debugInfo[debugInfoIdx++].second); + } + const uint32_t *d = &stream[i]; + std::string op; + std::vector> fields; + int nrWords = static_cast(this)->Disassemble(d, op, fields); + uint32_t code = *d & 0xffff; + uint32_t par = *d >> 16; + uint32_t payload = 0; + if ( nrWords == 2 && i + 1 < stream.size() ) + { + payload = stream[i + 1]; + } + const auto &intr = nrWords == 2 ? fmt::format("{:08x}", payload) : fmt::format("{:8}", ""); + LOG_PRINT("{0:#08x}: {1} {2:04x} {3:04x} - {4:30} {5:5}", i * sizeof(uint32_t), intr, par, code, op, par); + i += nrWords; + for ( auto &f : fields ) + { + LOG_PRINT(", {} = {}", f.first, f.second); + } + LOG_PRINT("\n"); + } + } +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethos_u_scaling.cpp b/ethosu/regor/architecture/ethos_u_scaling.cpp new file mode 100644 index 00000000..3f2b17ad --- /dev/null +++ b/ethosu/regor/architecture/ethos_u_scaling.cpp @@ -0,0 +1,69 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "ethos_u_scaling.hpp" + +#include "common/numeric_util.hpp" + +#include +#include +#include +#include + +namespace regor +{ + +void QuantizePoolingScale(int kernelElements, double rescale, int rescaleBits, uint32_t &scale, int &shift, int N) +{ + int exp; + std::frexp(float(kernelElements - 1), &exp); + // N = scale instruction register size + int n = (N - 1) - rescaleBits; + scale = uint32_t(std::ceil(rescale * double(((1ULL << (n + exp)) + (1ULL << exp)) / kernelElements))); + shift = n + exp; + assert(unsigned(shift) < 64); +} + +void QuantizePoolingScaleMaxPrecision(int kernelElements, double rescale, uint32_t &scale, int &shift, int N) +{ + int rescaleBits = 0; + // if rescale != 1, scale need to consider the number of bits needed for rescaling + if ( rescale > 1 ) + { + rescaleBits = IntLog2(rescale) + 2; + } + else if ( rescale < 1 ) + { + rescaleBits = -IntLog2(1.0 / rescale); + } + QuantizePoolingScale(kernelElements, rescale, rescaleBits, scale, shift, N); +} + +// Simplified version of calculating elementwise Add/Sub scales +void SimplifiedElementwiseAddSubScale(double input1Scale, double input2Scale, double outputScale, int inputShift, + double &input1Rescale, double &input2Rescale, QuantizedScale &outScale) +{ + auto m = 2 * std::max(input1Scale, input2Scale); + auto f = double(int64_t(1) << inputShift); + input1Rescale = input1Scale * f / m; + input2Rescale = input2Scale * f / m; + double outputRescale = m / (outputScale * f); + outScale = QuantizedScale(outputRescale); +} + +} // namespace regor diff --git 
a/ethosu/regor/architecture/ethos_u_scaling.hpp b/ethosu/regor/architecture/ethos_u_scaling.hpp new file mode 100644 index 00000000..1cbb7c55 --- /dev/null +++ b/ethosu/regor/architecture/ethos_u_scaling.hpp @@ -0,0 +1,37 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/scaling.hpp" + +#include + +namespace regor +{ + +void QuantizePoolingScale(int kernelElements, double rescale, int rescaleBits, uint32_t &scale, int &shift, int N); + +// Max scale precision based on register size N (32 or 31) +void QuantizePoolingScaleMaxPrecision(int kernelElements, double rescale, uint32_t &scale, int &shift, int N); + +// Simplified version of calculating elementwise Add/Sub scales +void SimplifiedElementwiseAddSubScale(double input1Scale, double input2Scale, double outputScale, int inputShift, + double &input1Rescale, double &input2Rescale, QuantizedScale &outScale); + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55.cpp new file mode 100644 index 00000000..e973170c --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55.cpp @@ -0,0 +1,932 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 
2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "ethos_u55.hpp" + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "common/bit_flags.hpp" +#include "common/numeric_util.hpp" +#include "ethos_u55_performance.hpp" +#include "ethos_u55_register_cs_generator.hpp" +#include "ethos_u55_weight_encoder.hpp" + +#include +#include +#include +#include +#include + +BEGIN_ENUM_TABLE(regor::EthosU55SHRamElements) + ADD_ENUM_NAME(SHRAM_IFM8) + ADD_ENUM_NAME(SHRAM_IFM16) + ADD_ENUM_NAME(SHRAM_IFM8_Elementwise) + ADD_ENUM_NAME(SHRAM_IFM16_Elementwise) + ADD_ENUM_NAME(SHRAM_IFM32) + ADD_ENUM_NAME(SHRAM_Acc16) + ADD_ENUM_NAME(SHRAM_Acc32) + ADD_ENUM_NAME(SHRAM_Acc40) +END_ENUM_TABLE() + +BEGIN_ENUM_TABLE(regor::EthosUTraversal) + ADD_ENUM_NAME(DepthFirst) + ADD_ENUM_NAME(PartKernel) + ADD_ENUM_NAME(Depthwise) +END_ENUM_TABLE() + +namespace regor +{ + +static const EthosU55PerfInfo s_EthosU55PerfInfo[] = { + // Accelerator.Ethos_U55_32 + {{2.0, 3.0, 3.0, 3.0, 4.0, 6.0, 1.0, 2.0}, {1.0, 1.0, 0.0}}, + // Accelerator.Ethos_U55_64 + {{1.0, 1.5, 1.5, 1.5, 2.0, 3.0, 0.5, 1.0}, {1.0, 1.0, 0.0}}, + // Accelerator.Ethos_U55_128 + {{0.75, 1.25, 0.75, 0.75, 1.0, 1.5, 0.25, 0.5}, {1.0, 0.5, 0.0}}, + // Accelerator.Ethos_U55_256 + {{0.625, 1.125, 0.5, 0.375, 0.5, 0.75, 0.125, 0.25}, {1.0, 0.25, 0.0}}, +}; + +static const ArchEthosU55::AcceleratorConfig s_EthosU55Configs[] = { + // Accelerator.Ethos_U55_32 + {32, 1, Shape(1, 1, 4), Shape(1, 1, 8), 16, {2, 2, 2, 2, 4, 4, 4, 4}, 1, &s_EthosU55PerfInfo[0]}, + // 
Accelerator.Ethos_U55_64 + {64, 1, Shape(1, 1, 8), Shape(1, 1, 8), 16, {2, 2, 2, 2, 4, 4, 4, 8}, 2, &s_EthosU55PerfInfo[1]}, + // Accelerator.Ethos_U55_128 + {128, 1, Shape(1, 2, 8), Shape(1, 2, 8), 24, {4, 4, 4, 4, 8, 4, 8, 12}, 4, &s_EthosU55PerfInfo[2]}, + // Accelerator.Ethos_U55_256 + {256, 1, Shape(2, 2, 8), Shape(2, 2, 8), 48, {8, 8, 8, 8, 16, 8, 16, 20}, 8, &s_EthosU55PerfInfo[3]}, +}; + +enum class ElementwiseUsage +{ + No = 0, + Full = 1, + Scalar = 2, +}; + +static const int s_SHRAMElementBits[] = { + 8, // IFM8 + 16, // IFM16 + 8, // IFM8_Elementwise + 16, // IFM16_Elementwise + 32, // IFM32 + 16, // Acc16 + 32, // Acc32 + 40, // Acc40 +}; + +static_assert(std::size(s_SHRAMElementBits) == int(SHRAM_Last) + 1, "Bad element mapping"); + + +ArchEthosU55::ArchEthosU55() : _subkernelMax(8, 8, 65536), _ofmBlockMax(32, 64, 128) +{ + _weightEncoder = std::make_unique(this); + _rcsGenerator = std::make_unique(this); +} + +uint32_t ArchEthosU55::Version() +{ + return EthosU55RCSGenerator::IdRegister(); +} + +bool ArchEthosU55::ParseConfig(IniReader *reader) +{ + // Parse architecture configuration + std::string key; + int macs = 0; + while ( reader->Begin(key) ) + { + if ( key == "macs" ) + { + macs = reader->Get(); + } + reader->End(); + } + + // Find the requested MAC configuration for this accelerator + auto cfg = std::find_if(s_EthosU55Configs, std::cend(s_EthosU55Configs), + [&](const AcceleratorConfig &config) { return config.macs == macs; }); + if ( cfg == std::cend(s_EthosU55Configs) ) + { + assert(macs == 32 || macs == 64 || macs == 128 || macs == 256); + LOG_TRACE0("Unable to find U55 accelerator for macs={}", macs); + return false; + } + + ApplyConfig(cfg); + + return true; +} + +void ArchEthosU55::ApplyConfig(const AcceleratorConfig *cfg) +{ + // Basic configuration + _cores = cfg->cores; + _macs = cfg->macs; + _ifmUBlock = cfg->ifmUblock; + _ofmUBlock = cfg->ofmUBlock; + + // All SHRAM granules + _shramGranules = cfg->shramGranules; + + // Bank 
granules organised by bit width + _ifmBankGranules[0] = cfg->shramGranules[SHRAM_IFM8]; + _ifmBankGranules[1] = cfg->shramGranules[SHRAM_IFM16]; + _ifmBankGranules[2] = 0; + _ifmBankGranules[3] = cfg->shramGranules[SHRAM_IFM32]; + + // Elementwise bank granules organised by bit width + _ifmEWBankGranules[0] = cfg->shramGranules[SHRAM_IFM8_Elementwise]; + _ifmEWBankGranules[1] = cfg->shramGranules[SHRAM_IFM16_Elementwise]; + _ifmEWBankGranules[2] = 0; + _ifmEWBankGranules[3] = cfg->shramGranules[SHRAM_IFM32]; + + // SHRAM layout information + _shram.reservedOutputBanks = 2; + _shram.bankSizeBytes = 1024, _shram.totalBanks = cfg->shramBanks; + _shram.reservedEndBanks = (_shram.totalBanks > 16) ? 2 : 0; + + _shramMemory = std::make_unique("shram", _shram.bankSizeBytes * _shram.totalBanks); + _shramMemory->SetParameters(1, 0, 0, 1, 0); // TODO: These figures make no sense for the shram (they depend on + // which HW unit is accessing the memory) + _lutMemory = _shramMemory.get(); + _performance = std::unique_ptr(new EthosU55Performance(this, cfg->perfInfo)); +} + + +std::unique_ptr ArchEthosU55::GetOpConfig(OpType opType, const ArchitectureConfigQuery &query) +{ + auto config = FindBlockConfig(opType, query); + return config; +} + + +std::unique_ptr ArchEthosU55::CreateOpGroup(const ArchitectureOpGroupQuery &op) +{ + LOG_TRACE1("Trying to create ArchEthosU55 OpGroup for {}\n", OpTypeToString(op.type)); + + auto group = std::make_unique(); + if ( !group->Add(op) ) + { + return nullptr; + } + + return group; +} + +std::vector ArchEthosU55::ConfigRegisters() +{ + return std::vector(1, ConfigRegister(0)); +} + +int ArchEthosU55::UpscaleAndRounding(ArchResampling resampling, int &rounding) +{ + rounding = (resampling == ArchResampling::Nearest) ? 1 : 0; + return (resampling == ArchResampling::None) ? 
1 : 2; +} + +AxisMask ArchEthosU55::CanSubdivide(OpType opType) +{ + if ( IsConvolution(opType) || IsElementwise(opType) || IsPooling(opType) ) + { + return AxisMask::AxisY; + } + return AxisMask::None; +} + +bool ArchEthosU55::SupportsLeakyRelu(bool quantized, DataType type) +{ + return quantized == false && type == DataType::Int16; +} + +bool ArchEthosU55::SupportsMatMul(OpType opType) +{ + UNUSED(opType); + return false; +} + +bool ArchEthosU55::SupportsTranspose(OpType opType, TransposeType transposeType) +{ + UNUSED(opType); + UNUSED(transposeType); + return IsNone(transposeType); +} + +bool ArchEthosU55::SupportsReverse(OpType opType, ReverseType reverseType) +{ + UNUSED(opType); + UNUSED(reverseType); + return reverseType == ReverseType::None; +} + +bool ArchEthosU55::SupportsGather(OpType opType) +{ + UNUSED(opType); + return false; +} + +bool ArchEthosU55::SupportsScatter(OpType opType) +{ + UNUSED(opType); + return false; +} +bool ArchEthosU55::SupportsResize(const ResizeSupportQuery &query) +{ + UNUSED(query); + return false; +} + +bool ArchEthosU55::SupportsSigmoidTanhLutInt16(OpType opType) +{ + UNUSED(opType); + return false; +} + +bool ArchEthosU55::SupportsAccumulatorMode(ArchAccumulatorSource source, bool outputEnabled) +{ + return source == ArchAccumulatorSource::Reset && outputEnabled; +} + +bool ArchEthosU55::SupportsScalar(OpType opType, DataType dataType, TensorUsage usage) +{ + bool supportedType(dataType == DataType::Int8 || dataType == DataType::UInt8 || dataType == DataType::Int16); + return EthosU55RCSGenerator::IsSupportedElementwise(opType) && supportedType && IsIFM(usage); +} + +bool ArchEthosU55::SupportsArgMax(OpType opType) +{ + UNUSED(opType); + return false; +} + +Flags ArchEthosU55::SupportedWeightFormat(OpType) +{ + return WeightFormat::Default; +} + +bool ArchEthosU55::UseAvgPoolNop(OpType type) +{ + return IsActivation(type) || type == OpType::Quantize || type == OpType::MemoryCopy; +} + +static bool ChooseKernelMethod(const 
Shape &ifmShape, int ifmBits, const Kernel *kernel) +{ + if ( ifmShape.Depth() <= 8 ) + { + return true; + } + + // Compare part-kernel to depth-kernel and choose the one with best utilisation + int kernelElements = kernel->ElementsWH(); + double depthUtilisation = ifmShape.Depth() / double(RoundAway(ifmShape.Depth(), ifmBits == 8 ? 32 : 16)); + double partUtilisation = + (ifmShape.Depth() / double(RoundAway(ifmShape.Depth(), 8)) * + (kernelElements / double(RoundAway(kernelElements, ifmBits == 8 ? 4 : 2)))); + + return partUtilisation >= depthUtilisation; +} + + +static Shape GetArchIFMBlockSize(const Shape &ofmBlock, const Kernel *kernel, const Shape &ublock, + const Shape &subkernelLimit, int upscale, int rounding) +{ + Point2i dilatedSize = kernel->DilatedWH(); + + // IFM block height + int h = RequiredInputSize(ofmBlock.Height(), kernel->Stride().y, std::min(dilatedSize.y, subkernelLimit.Height()), upscale, rounding); + h = RoundAway(h, ublock.Height()); + + // IFM block width + int w = RequiredInputSize(ofmBlock.Width(), kernel->Stride().x, std::min(dilatedSize.x, subkernelLimit.Width()), upscale, rounding); + w = RoundAway(w, ublock.Width()); + + return Shape(1, h, w, ofmBlock.Depth()); +} + + +static Shape FitBlockForOFM(const Shape &ofmShape, const Kernel *kernel, const Shape &block, int ublockHeight) +{ + // 256/512 __Conv1D__ optimisation (ratio of IFM:Accumulators changes) This is a specific + // interpretation of a more general constraint that can't be applied because the + // FindBlockConfig function must return block configs that can be applied to any OFM shape. 
+ if ( (ofmShape.Height() == 1) && (kernel->Size().y == 1) && (ublockHeight == 2) ) + { + return Shape(1, std::min(block.Height(), ofmShape.Height()), block.Width(), block.Depth()); + } + return block; +} + + +std::unique_ptr ArchEthosU55::FindBlockConfig(OpType opType, const ArchitectureConfigQuery &query) +{ + assert(query.ifmBits > 0 && query.ifmBits <= 32); + assert(query.ofmShape.Size() > 2 && "Insufficient dimensions to search for block config"); + assert(query.kernel != nullptr); + + if ( !SupportsAccumulatorMode(query.accSource, query.accOutputEnabled) ) return nullptr; + + const int OFMSplitDepth = 16; // Specific to this architecture + + // Elementwise larger-volume correction + const Shape &ifmShape = (query.ifmShape[1].Elements() > query.ifmShape[0].Elements()) ? query.ifmShape[1] : query.ifmShape[0]; + + EthosU55NpuOp npuOp = GetHWOp(opType); + assert(npuOp != EthosU55NpuOp::None); + + // Figure out if SHRAM should be portioned for elementwise + ElementwiseUsage ewUsage = ElementwiseUsage::No; + if ( npuOp == EthosU55NpuOp::Elementwise ) + { + bool usesScalar = query.ifmShape[0].Elements() == 1; + if ( query.ifmShape[1].IsValid() ) + { + usesScalar = usesScalar || query.ifmShape[1].Elements() == 1; + } + + ewUsage = (usesScalar && (query.ifmBits <= 16)) ? 
ElementwiseUsage::Scalar : ElementwiseUsage::Full; + } + + // Operator typing help + bool isPooling = npuOp == EthosU55NpuOp::Pooling || npuOp == EthosU55NpuOp::ReduceSum; + bool isReduceSum = npuOp == EthosU55NpuOp::ReduceSum; + bool isDepthwise = npuOp == EthosU55NpuOp::Depthwise; + bool isEqualDepthOp = (ewUsage != ElementwiseUsage::No) || (isPooling && !isReduceSum) || isDepthwise; + bool isPartKernel = npuOp == EthosU55NpuOp::Convolution && ChooseKernelMethod(ifmShape, query.ifmBits, query.kernel); + + // Operator configuration to be returned + auto config = std::make_unique(); + config->_bankSize = _shram.bankSizeBytes; + // IFM is not broadcasted for pooling and depthwise ops and for elementwise + // when there's no elementwise-broadcasting in depth + int ifmDepthBufScaling = + isPooling || isDepthwise || IsUnaryElementwise(opType) || + (IsBinaryElementwise(opType) && (query.ifmShape[0].Depth() == query.ifmShape[1].Depth())) ? + _cores : + 1; + config->_ifmDepthBufScaling = ifmDepthBufScaling; + config->_traversal = isDepthwise ? EthosUTraversal::Depthwise : (isPartKernel ? 
EthosUTraversal::PartKernel : EthosUTraversal::DepthFirst); + + // Accumulator & granule settings + EthosU55SHRamElements accType = SHRAM_Acc32; + if ( query.ifmBits == 16 && (!isPooling || isReduceSum) && query.scaled ) + { + accType = SHRAM_Acc40; + } + config->_accumulatorType = accType; + + // Memory rounding granules + int accGranule = _shramGranules[accType]; + int accBits = s_SHRAMElementBits[accType]; + int ifmGranule = 0; + if ( ewUsage != ElementwiseUsage::No ) + { + ifmGranule = _ifmEWBankGranules[query.ifmBits / 8 - 1]; + } + else + { + ifmGranule = _ifmBankGranules[query.ifmBits / 8 - 1]; + } + + int rounding; + int upscale = UpscaleAndRounding(query.ifmResampling, rounding); + int lutBanks = std::max(DivRoundUp(query.lutBytes, 1024), _shram.reservedEndBanks); + + // Subkernel repeats of the IFM + Point2i dilatedWH = query.kernel->DilatedWH(); + int ifmRepeats = DivRoundUp(dilatedWH.x, _subkernelMax.Width()) * DivRoundUp(dilatedWH.y, _subkernelMax.Height()); + + int ifmBlockDepth = 0; + if ( query.ifmBits == 16 ) + { + ifmBlockDepth = RoundAway(std::min(ifmShape.Depth(), 16), 4); + } + else + { + ifmBlockDepth = RoundAway(std::min(ifmShape.Depth(), isPartKernel ? 16 : 32), _ifmUBlock.Depth()); + } + + // Weights fetch (for operators that have them) + bool hasWeights = npuOp == EthosU55NpuOp::Convolution || isDepthwise; + int weightFetchWH = hasWeights ? 
query.kernel->Size().AreaXY() : 0; + + int ofmUBlockDepth = _ofmUBlock.Depth() * _cores; + Shape searchSpace = Shape::RoundAway(Shape::Min(query.ofmShape, _ofmBlockMax), _ofmUBlock.WithDepth(ofmUBlockDepth)); + + // Block WHC search, loops across the search space looking for best efficiency + float bestCost = std::numeric_limits::infinity(); + float bestCoverage = std::numeric_limits::infinity(); + int ofmElements = query.ofmShape.Elements(); + + int depth = std::max(ofmUBlockDepth, std::min(searchSpace.Depth(), OFMSplitDepth)); + if ( depth < query.ofmShape.Depth() ) + { + depth = RoundAway(depth, OFMSplitDepth); + } + + std::unordered_set> wontFit; + while ( depth <= searchSpace.Depth() ) + { + for ( int height = _ofmUBlock.Height(); height <= searchSpace.Height(); height += _ofmUBlock.Height() ) + { + for ( int width = _ofmUBlock.Width(); width <= searchSpace.Width(); width += _ofmUBlock.Width() ) + { + // Avoid checking W/H transposed blocks that already didn't fit. i.e. if 8x4x16 didn't + // fit, then 4x8x16 won't either. 
+ if ( wontFit.count(Point2i(height, width)) > 0 ) + { + continue; + } + + // Calculate the IFM block dimensions required to feed this OFM block + Shape ofmBlock = Shape(height, width, depth); + Shape ifmBlock = GetArchIFMBlockSize(ofmBlock, query.kernel, _ofmUBlock, _subkernelMax, upscale, rounding); + if ( !isEqualDepthOp ) + { + ifmBlock[-1] = ifmBlockDepth; + } + + ofmBlock = FitBlockForOFM(query.ofmShape, query.kernel, ofmBlock, _ofmUBlock.Height()); + + // Test if the IFM/OFM blocks fit into SHRAM + EthosU55OpConfig::SHRAMLayout layout; + if ( TryBlockConfig(layout, int(ewUsage), ofmBlock, ifmBlock, query.ifmBits, ifmGranule, accBits, accGranule, lutBanks, ifmDepthBufScaling) ) + { + Shape fullBlocks = Shape::DivRoundUp(query.ofmShape, ofmBlock); + Point3 blocks = query.ofmShape.HWC() / ofmBlock.HWC(); + + // Weights fetching + float weightFetch = float(weightFetchWH * ifmShape.Depth() * fullBlocks.ElementsWH()); + if ( !isDepthwise ) + { + weightFetch *= blocks.z * float(ofmBlock.Depth()); + } + + // IFM fetching + float ifmFetch = float(ifmBlock.ElementsWH() * ifmShape.Depth() * ifmRepeats) * blocks.x * blocks.y; + if ( !isEqualDepthOp ) + { + ifmFetch *= float(fullBlocks.Depth()); + } + + // Scale relative to every output OFM element + float relativeCost = + npuOp == EthosU55NpuOp::Elementwise ? 
float(ofmElements) / float(height * width * depth) : (ifmFetch + weightFetch) / float(ofmElements); + + // If the entire IFM can be encompassed by both buffers, bias to prefer this configuration + if ( ifmShape.Elements() < ifmBlock.Elements() * 2 ) + { + relativeCost = relativeCost / 2.0f; + } + + // Choose based on relative minimum cost or larger IFM area (if equal cost) + if ( relativeCost <= bestCost ) + { + bool chooseThis = false; + // Check IFM coverage only when it's equal best_cost and small OFM + if ( relativeCost == bestCost ) + { + Shape coverageShape = Shape::Min(ifmShape, ifmBlock); + float coverage = float(ifmShape.ElementsWH()) / float(coverageShape.ElementsWH()); + // Small 4x4 IFM constraint found through analysis of networks + if ( coverage <= bestCoverage && (height <= 4 && width <= 4) ) + { + bestCoverage = coverage; + chooseThis = true; + } + } + else + { + bestCoverage = std::numeric_limits::infinity(); + chooseThis = true; + } + + if ( chooseThis ) + { + bestCost = relativeCost; + config->_layout = layout; + config->_ifmBlock = ifmBlock; + config->_ofmBlock = Shape(1, height, width, depth); + } + } + } + else + { + wontFit.emplace(width, height); + } + } + } + + // Try Next block depth, rounded + depth = depth + ofmUBlockDepth; + if ( depth < query.ofmShape.Depth() ) + { + depth = RoundAway(depth, OFMSplitDepth); + } + } + + // Return the best configuration + if ( bestCost != std::numeric_limits::infinity() ) + { + return std::unique_ptr(config.release()); + } + + // Didn't find a configuration + return std::unique_ptr(); +} + + +bool ArchEthosU55::TryBlockConfig(EthosU55OpConfig::SHRAMLayout &layout, int ewUsage, const Shape &ofmBlock, + const Shape &ifmBlock, int ifmBits, int ifmGranule, int accBits, int accGranule, int lutBanks, int ifmDepthBufScaling) +{ + assert((accBits > 0) && (accGranule > 0)); + assert((ifmBits >= 8) && ((ifmBits % 8) == 0) && (ifmGranule > 0)); + + // Scale depth with cores + int ifm_depth = 
DivRoundUp(ifmBlock.Depth(), ifmDepthBufScaling); + int ofm_depth = DivRoundUp(ofmBlock.Depth(), _cores); + + // Always need IFM space + int ifm_bytes = ifmBlock.ElementsWH() * RoundAway(ifm_depth * (ifmBits / 8), 8); + int ifm_banks = DivRoundUp(ifm_bytes, _shram.bankSizeBytes) * 2; + ifm_banks = RoundAway(ifm_banks, ifmGranule); + + // Calculate SHRAM boundaries of the IFM and Accumulators + int lut_start = _shram.totalBanks - lutBanks; + int ifm_end = _shram.reservedOutputBanks + ifm_banks; + int ifm2_start = ifm_end; + int acc_start = lut_start; + + // If not elementwise then we need accumulator space + if ( ewUsage == int(ElementwiseUsage::No) ) + { + int acc_bytes = (ofmBlock.ElementsWH() * RoundAway(ofm_depth, 8) * accBits) / 8; + int acc_banks = DivRoundUp(acc_bytes, _shram.bankSizeBytes) * 2; + acc_banks = RoundAway(acc_banks, accGranule); + acc_start = acc_start - acc_banks; + } + else + { + int ifm2_banks = (ewUsage == int(ElementwiseUsage::Full)) ? ifm_banks : 0; + if ( ifm2_start + ifm2_banks > acc_start ) + { + return false; + } + ifm_end = acc_start; + } + + // IFM must still fit before accumulators + if ( ifm_end > acc_start ) + { + return false; + } + + // Should all fit, so return this layout + layout.ibStart = _shram.reservedOutputBanks; + layout.ibStart2 = ifm2_start; + layout.ibEnd = ifm_end; + layout.abStart = acc_start; + layout.lutStart = lut_start; + return true; +} + + +Shape ArchEthosU55::GetStorageRounding(TensorFormat format) +{ + if ( format == TensorFormat::NHCWB16 ) + { + return Shape(1, 1, 1, 16); + } + + return Shape(1, 1, 1, 1); +} + + +uint32_t ArchEthosU55::ConfigRegister(int product) +{ + uint32_t macs = _macs * _cores; + int macsCeilLog2 = 0; + while ( macs >>= 1 ) + { + macsCeilLog2++; + } + int shramSize = _cores * (int(_shramMemory->SizeBytes()) >> 10); + assert(macsCeilLog2 < 16); + assert(shramSize < 256); + return macsCeilLog2 | (shramSize << 8) | (product << 28); +} + +std::unique_ptr EthosU55OpConfig::Clone() +{ + auto 
config = std::make_unique(); + config->_bankSize = _bankSize; + config->_ifmDepthBufScaling = _ifmDepthBufScaling; + config->_traversal = _traversal; + config->_accumulatorType = _accumulatorType; + config->_ofmBlock = _ofmBlock; + config->_ifmBlock = _ifmBlock; + config->_layout = _layout; + return std::unique_ptr(config.release()); +} + +int EthosU55OpConfig::MaxIFMBuffering() +{ + return (_layout.ibEnd - _layout.ibStart) * _bankSize * _ifmDepthBufScaling; +} + +Point2i EthosU55OpConfig::OptimalStripeGranule() +{ + return _ofmBlock.WH(); +} + +int EthosU55OpConfig::OptimalDepthGranule() +{ + return _ofmBlock.Depth(); +} + +std::string EthosU55OpConfig::ToString(bool full) +{ + std::string tmp = fmt::format("OFM Block=[{}], IFM Block=[{}], Traversal={}, AccType={}", _ofmBlock.ToString(), + _ifmBlock.ToString(), EnumToString(_traversal), EnumToString(_accumulatorType)); + if ( full ) + { + tmp += fmt::format("\nSHRAM: ib={} ibE={}, ib2={}, ab={}, lut={}", _layout.ibStart, _layout.ibEnd, + _layout.ibStart2, _layout.abStart, _layout.lutStart); + } + return tmp; +} + +EthosU55NpuOp ArchEthosU55::GetHWOp(OpType type) +{ + static const std::unordered_map toNpuOp = { + {OpType::DepthwiseConv2DBias, EthosU55NpuOp::Depthwise}, + {OpType::Conv2D, EthosU55NpuOp::Convolution}, + {OpType::Conv2DBackpropInput, EthosU55NpuOp::Convolution}, + {OpType::Conv2DBackpropInputSwitchedBias, EthosU55NpuOp::Convolution}, + {OpType::Conv2DBias, EthosU55NpuOp::Convolution}, + {OpType::FullyConnected, EthosU55NpuOp::VectorProduct}, + {OpType::MaxPool, EthosU55NpuOp::Pooling}, + {OpType::AvgPool, EthosU55NpuOp::Pooling}, + {OpType::QuantizedAvgPool, EthosU55NpuOp::Pooling}, + {OpType::QuantizedMaxPool, EthosU55NpuOp::Pooling}, + {OpType::ResizeBilinear, EthosU55NpuOp::Pooling}, + {OpType::ReduceSum, EthosU55NpuOp::ReduceSum}, + }; + auto pos = toNpuOp.find(type); + if ( pos != toNpuOp.end() ) + { + return pos->second; + } + else if ( EthosU55RCSGenerator::IsSupportedElementwise(type) ) + { + 
return EthosU55NpuOp::Elementwise; + } + else if ( UseAvgPoolNop(type) ) + { + return EthosU55NpuOp::Pooling; + } + return EthosU55NpuOp::None; +} + +int EthosU55OpGroup::Add(const ArchitectureOpGroupQuery &op, const std::vector &dependsOn) +{ + LOG_TRACE1("Trying to add op {}\n", OpTypeToString(op.type)); + + if ( _opsCount >= 2 ) + { + // Can only fuse 2 ops + return 0; + } + + for ( int dep : dependsOn ) + { + if ( dep > 0 ) + { + // Don't validate user-specified (positive keys) dependencies + continue; + } + else if ( dep < 0 ) + { + // Convert to group generated keys (negative keys) to array index + dep = (-dep) - 1; + if ( dep >= _opsCount ) + { + // Missing dependency + return 0; + } + } + + const EthosU55OpGroup::OpInfo &prevOp = _ops[dep]; + if ( prevOp.ofm.key != op.ifm.key && prevOp.ofm.key != op.ifm2.key ) + { + // Can only fuse when ops are connected + return 0; + } + } + + if ( !CanRunOnNPU(op) ) + { + // Can only fuse NPU ops + return 0; + } + + if ( _opsCount > 0 && !IsActivation(op.type) ) + { + // Can only fuse with activation + return 0; + } + + // Generated key + int key = (-_opsCount) - 1; + + // Save copy of op + _ops[_opsCount].type = op.type; + _ops[_opsCount].ifm.key = op.ifm.key; + _ops[_opsCount].ifm.type = op.ifm.type; + _ops[_opsCount].ifm2.key = op.ifm2.key; + _ops[_opsCount].ifm2.type = op.ifm2.type; + _ops[_opsCount].ofm.key = op.ofm.key; + _ops[_opsCount].ofm.type = op.ofm.type; + _opsInternal[_opsCount].dependsOn = dependsOn; + _opsCount++; + + return key; +} + +// Table of allowed ifm/ofm data type combinations for each HWOp +static const std::unordered_map>> s_opDataTypeSupport = { + {EthosU55NpuOp::Convolution, // HWOp + { + // IFM data type | OFM data type(s) + {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + 
}}, + {EthosU55NpuOp::Depthwise, + { + {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + }}, + {EthosU55NpuOp::VectorProduct, + { + {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + }}, + {EthosU55NpuOp::Pooling, + { + {DataType::UInt8, {DataType::UInt8}}, + {DataType::Int8, {DataType::Int8}}, + {DataType::Int16, {DataType::Int16}}, + }}, + {EthosU55NpuOp::ReduceSum, + { + {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int32, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + }}, +}; + +bool EthosU55OpGroup::CanRunOnNPU(const ArchitectureOpGroupQuery &op) +{ + EthosU55NpuOp npuOp = ArchEthosU55::GetHWOp(op.type); + + if ( IsFloat(op.ifm.type | op.ifm2.type | op.ofm.type) ) + { + return false; + } + + if ( npuOp == EthosU55NpuOp::None ) + { + return false; + } + + auto k = op.kernel; + if ( k->Stride().x > 3 || k->Stride().y > 3 ) + { + return false; + } + + if ( k->Dilation().x > 2 || k->Dilation().y > 2 ) + { + return false; + } + + switch ( npuOp ) + { + case EthosU55NpuOp::Convolution: + case EthosU55NpuOp::Depthwise: + case EthosU55NpuOp::VectorProduct: + case EthosU55NpuOp::Pooling: + case EthosU55NpuOp::ReduceSum: + case EthosU55NpuOp::Elementwise: + break; + default: + assert(false && "Unrecognized HWOp"); + return false; + } + + // Check allowed ifm/ofm type mapping + if ( npuOp 
!= EthosU55NpuOp::Elementwise ) + { + if ( op.type == OpType::LUT || op.type == OpType::MemoryCopy ) + { // TODO: LUT operations end up here due to UseAvgPoolNop although the rules are not the same as + // for a Pooling operation, so skip checks for now. + return true; + } + + auto map = s_opDataTypeSupport.find(npuOp); + if ( map == s_opDataTypeSupport.end() ) + { + assert(false && "Data type mapping for HWOp missing"); + return false; + } + auto &typeMap = map->second; + auto ifmEntry = typeMap.find(op.ifm.type); + if ( ifmEntry == typeMap.end() ) + { // Unsupported ifm data type + return false; + } + auto &ofmTypes = ifmEntry->second; + if ( 0 == std::count(ofmTypes.begin(), ofmTypes.end(), op.ofm.type) ) + { // Unsupported ofm data type + return false; + } + } + else + { + std::vector validIfmTypes; + std::vector validOfmTypes; + switch ( op.type ) + { + case OpType::Add: + case OpType::Sub: + case OpType::Mul: + { + validIfmTypes = {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}; + validOfmTypes = validIfmTypes; + } + break; + case OpType::Minimum: + case OpType::Maximum: + case OpType::LeakyRelu: + case OpType::Abs: + { + validIfmTypes = {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}; + validOfmTypes = {op.ifm.type}; + } + break; + case OpType::CLZ: + case OpType::SHL: + case OpType::Asr: + { + validIfmTypes = {DataType::Int32}; + validOfmTypes = {DataType::Int32}; + if ( op.type == OpType::Asr ) + { + validOfmTypes.insert(validOfmTypes.begin(), {DataType::UInt8, DataType::Int8, DataType::Int16}); + } + } + break; + default: + assert(false && "Unkown elementwise type"); + break; + } + + if ( 0 == std::count(validIfmTypes.begin(), validIfmTypes.end(), op.ifm.type) ) + { // Unsupported ifm data type + return false; + } + if ( IsBinaryElementwise(op.type) && op.ifm2.type != op.ifm.type ) + { // ifm2 data type must match ifm data type + return false; + } + if ( 0 == std::count(validOfmTypes.begin(), validOfmTypes.end(), 
op.ofm.type) ) + { // Unsupported ofm data type + return false; + } + } + + return true; +} + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55.hpp new file mode 100644 index 00000000..d83ec42a --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55.hpp @@ -0,0 +1,239 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "architecture/architecture.hpp" +#include "architecture/ethos_u_scaling.hpp" +#include "architecture/register_command_stream_generator.hpp" +#include "architecture/weight_encoder.hpp" +#include "common/bit_flags.hpp" +#include "common/shape.hpp" +#include "ethos_u55_performance.hpp" + +#include + +namespace regor +{ + +enum EthosU55SHRamElements +{ + SHRAM_IFM8 = 0, + SHRAM_IFM16 = 1, + SHRAM_IFM8_Elementwise = 2, + SHRAM_IFM16_Elementwise = 3, + SHRAM_IFM32 = 4, + SHRAM_Acc16 = 5, + SHRAM_Acc32 = 6, + SHRAM_Acc40 = 7, + SHRAM_Last = SHRAM_Acc40 +}; + +enum class EthosUTraversal +{ + DepthFirst = 0, + PartKernel = 1, + Depthwise = 2, +}; + +class ArchEthosU55; + +enum class EthosU55NpuOp +{ + None = 0, + Convolution, + Depthwise, + VectorProduct, + Pooling, + ReduceSum, + Elementwise, +}; + +/// +/// Per-operator architecture configuration +/// +class EthosU55OpConfig : public ArchitectureOpConfig +{ + friend class ArchEthosU55; + friend class EthosU55RCSGenerator; + +public: + struct SHRAMLayout + { + int ibStart = 0; + int ibEnd = 0; + int ibStart2 = 0; + int abStart = 0; + int lutStart = 0; + }; + +private: + SHRAMLayout _layout; + Shape _ifmBlock; + Shape _ofmBlock; + EthosU55SHRamElements _accumulatorType = SHRAM_Acc32; + EthosUTraversal _traversal = EthosUTraversal::DepthFirst; + int _bankSize = 0; + int _ifmDepthBufScaling = 0; + +public: + EthosUTraversal Traversal() const { return _traversal; } + const Shape &IfmBlock() const { return _ifmBlock; } + const Shape &OfmBlock() const { return _ofmBlock; } + EthosU55SHRamElements Acc() const { return _accumulatorType; } + + std::unique_ptr Clone() override; + int MaxIFMBuffering() override; + Point2i OptimalStripeGranule() override; + int OptimalDepthGranule() override; + std::string ToString(bool full) override; +}; + +/// +/// Group of ops that can be fused and/or chained +/// +class EthosU55OpGroup : public ArchitectureOpGroup +{ + friend class ArchEthosU55; + + using OpInfo = 
ArchitectureOpGroupQuery; + + struct InternalOpInfo + { + std::vector dependsOn; + }; + +private: + std::array _ops; + std::array _opsInternal; + int _opsCount = 0; + +public: + int Add(const ArchitectureOpGroupQuery &op, const std::vector &dependsOn = {}) override; + +protected: + bool CanRunOnNPU(const ArchitectureOpGroupQuery &op) override; +}; + +/// +/// EthosU55 specialisation +/// +class ArchEthosU55 : public Architecture +{ + friend class EthosU55WeightEncoder; + friend class EthosU55Performance; + friend class EthosU55RCSGenerator; + friend class EthosU65RCSGenerator; + friend class EthosU55OpGroup; + +public: + struct AcceleratorConfig + { + int macs; + int cores; + Shape ofmUBlock; + Shape ifmUblock; + int shramBanks; + int8_t shramGranules[8]; + int elemUnits; + const EthosU55PerfInfo *perfInfo; + }; + +private: + std::unique_ptr _shramMemory; + Shape _subkernelMax; + Shape _ofmBlockMax; + int _cores = 0; + int _macs = 0; + Shape _ofmUBlock; + Shape _ifmUBlock; + const int8_t *_shramGranules = nullptr; + int _ifmBankGranules[4] = {0}; + int _ifmEWBankGranules[4] = {0}; + + struct + { + int reservedOutputBanks = 0; + int bankSizeBytes = 0; + int totalBanks = 0; + int reservedEndBanks = 0; + int lutBanks = 2; + int lutSlotSize = 256; + } _shram; + +protected: + std::unique_ptr _weightEncoder; + std::unique_ptr _performance; + std::unique_ptr _rcsGenerator; + +public: + ArchEthosU55(); + +public: + bool ParseConfig(IniReader *reader) override; + + std::unique_ptr GetOpConfig(OpType opType, const ArchitectureConfigQuery &query) override; + std::unique_ptr CreateOpGroup(const ArchitectureOpGroupQuery &op) override; + class WeightEncoder *WeightEncoder() override { return _weightEncoder.get(); } + IRegisterCommandStreamGenerator *RegisterCommandStreamGenerator() override { return _rcsGenerator.get(); } + ArchitecturePerformance *Performance() override { return _performance.get(); } + TensorFormat IdealBufferingFormat() override { return TensorFormat::NHCWB16; 
} + Address MaxAddress() override { return 1LL << 32; } + std::vector ConfigRegisters() override; + int UpscaleAndRounding(ArchResampling resampling, int &rounding) override; + AxisMask CanSubdivide(OpType opType) override; + bool SupportsLeakyRelu(bool quantized, DataType type) override; + bool SupportsMatMul(OpType opType) override; + bool SupportsTranspose(OpType opType, TransposeType transposeType) override; + bool SupportsReverse(OpType opType, ReverseType reverseType) override; + bool SupportsGather(OpType opType) override; + bool SupportsScatter(OpType opType) override; + bool SupportsSigmoidTanhLutInt16(OpType opType) override; + bool SupportsResize(const ResizeSupportQuery &query) override; + bool SupportsAccumulatorMode(ArchAccumulatorSource source, bool outputEnabled) override; + bool SupportsScalar(OpType opType, DataType dataType, TensorUsage usage) override; + bool SupportsArgMax(OpType opType) override; + Flags SupportedWeightFormat(OpType op) override; + uint32_t Version() override; + +protected: + Shape OfmUBlock() { return _ofmUBlock; } + void ApplyConfig(const AcceleratorConfig *cfg); + + std::unique_ptr FindBlockConfig(OpType opType, const ArchitectureConfigQuery &query); + + bool TryBlockConfig(EthosU55OpConfig::SHRAMLayout &layout, int ewUsage, const Shape &ofmBlock, const Shape &ifmBlock, + int ifmBits, int ifmGranule, int accBits, int accGranule, int lutBanks, int ifmDepthBufScaling); + + Shape GetStorageRounding(TensorFormat format); + + uint32_t ConfigRegister(int product); + + bool IsU55_32() const { return (_macs == 32) && (_cores == 1); } + + // Checks if the operation is to be mapped on AvgPool + static bool UseAvgPoolNop(OpType type); + static EthosU55NpuOp GetHWOp(OpType type); + +private: + int MaxOutstandingKernelOps() { return 2; } + virtual int MaxOutstandingDMAOps() { return 1; } + int MaxBlockdep() { return 3; } +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp 
b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp new file mode 100644 index 00000000..527e159b --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.cpp @@ -0,0 +1,531 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "ethos_u55_performance.hpp" + +#include "common/common.hpp" + +#include "architecture/architecture.hpp" +#include "ethos_u55.hpp" + +namespace regor +{ + +static const Point2i s_SubkernelLimits[] = { + {0, 0}, // No kernel + {8, 8}, // Convolution + {8, 8}, // Depthwise + {1, 1}, // VectorProduct + {8, 8}, // Pooling + {8, 8}, // ReduceSum + {1, 1}, // Elementwise +}; + +static constexpr bool OpUsesMacs(EthosU55NpuOp npuOp) +{ + return (npuOp != EthosU55NpuOp::Elementwise && npuOp != EthosU55NpuOp::None); +} + +EthosU55Performance::EthosU55Performance(ArchEthosU55 *arch, const EthosU55PerfInfo *perfInfo) : _arch(arch) +{ + _perfInfo = perfInfo; +} + +CycleCost EthosU55Performance::MeasureCycleCost(const PerformanceQuery &query, const std::vector &fused) +{ + CycleCost cycles; + auto npuOp = _arch->GetHWOp(query.type); + + // Convolution/Vector product cycle calculation + if ( OpUsesMacs(npuOp) ) + { + if ( (npuOp == EthosU55NpuOp::Depthwise) || (npuOp == EthosU55NpuOp::Pooling) ) + { + cycles.macs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements() * 1; + } + 
else + { + cycles.macs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements() * query.ifmShape[0].Depth(); + } + + cycles.opCycles = EstimateConvCycles(query, fused); + } + // Elementwise cycle calculation + else if ( npuOp == EthosU55NpuOp::Elementwise ) + { + auto ofmShape = + (query.ofmFormat == TensorFormat::NHCWB16) ? Shape::RoundAway(query.ofmShape, Shape(1, 1, 1, 16)) : query.ofmShape; + cycles.opCycles = int64_t(EstimateOutputCyclesPerElement(query, fused) * float(ofmShape.Elements())); + } + else + { + assert(false && "Unknown operator cycle costing"); + } + + return cycles; +} + +int64_t EthosU55Performance::MemToMemCycles(const ArchitectureMemory *dest, const ArchitectureMemory *source, int sizeBytes) +{ + int64_t fromCycles = int64_t(float(sizeBytes) / source->Bandwidth()); + fromCycles += source->ReadLatency(); + int64_t toCycles = int64_t(float(sizeBytes) / dest->Bandwidth()); + toCycles += source->WriteLatency(); + return std::max(fromCycles, toCycles); +} + +int64_t EthosU55Performance::EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused) +{ + EthosU55OpConfig *opConfig = static_cast(query.config); + auto npuOp = _arch->GetHWOp(query.type); + assert(npuOp != EthosU55NpuOp::None); + + Shape ifmBlock = Shape::Min(query.ifmShape[0], opConfig->IfmBlock()); + Shape ofmBlock = Shape::Min(query.ofmShape, opConfig->OfmBlock()); + Shape ofmUBlock = _arch->OfmUBlock(); + + // HW Optimisation check + if ( (ofmUBlock.Height() == 2) && (npuOp == EthosU55NpuOp::Convolution || npuOp == EthosU55NpuOp::VectorProduct) && + (query.ofmShape.Height() == 1) && (query.ofmShape.Width() % 2 == 0) && // Optimisation only applies for even + // width tensors + (query.kernel->Size().y == 1) ) + { + ofmUBlock = Shape(1, 1, 4, ofmUBlock.Depth()); + ofmBlock = ofmBlock.WithHeight(1); + } + + int ifmBits = DataTypeSizeBits(query.ifmType[0]); + Shape numUBlocks = Shape::DivRoundUp(ofmBlock, ofmUBlock); + bool use40BitAcc = opConfig->Acc() == 
EthosU55SHRamElements::SHRAM_Acc40; + + int64_t cyclesDpuBlk = 0; + int cyclesWb = 32 * ofmUBlock.Depth() / 8; + + int subKernelWidth = s_SubkernelLimits[int(npuOp)].x; + int subKernelHeight = s_SubkernelLimits[int(npuOp)].y; + const Point2i kernelSize = query.kernel->Size(); + bool isConvolutionMxN = (npuOp == EthosU55NpuOp::Convolution); + + for ( int x = 0; x < kernelSize.x; x += subKernelWidth ) + { + for ( int y = 0; y < kernelSize.y; y += subKernelHeight ) + { + int subKernelElements = std::min(kernelSize.y - y, subKernelHeight); + subKernelElements *= std::min(kernelSize.x - x, subKernelWidth); + + // Calculate processing cycles + int numKernelSteps = 0; + int cycles = 0; + if ( npuOp == EthosU55NpuOp::Pooling ) + { + numKernelSteps = 1; + cycles = std::max(4, subKernelElements) * numUBlocks.Elements(); + if ( !_arch->IsU55_32() ) + { + cycles = cycles * (ifmBits / 2); + } + } + else if ( npuOp == EthosU55NpuOp::Depthwise ) + { + numKernelSteps = DivRoundUp(subKernelElements, 4); + cycles = 4 * numUBlocks.ElementsWH() * (ifmBits / 8); + cycles = std::max(cyclesWb, cycles) * numKernelSteps * numUBlocks.Depth(); + } + else if ( (isConvolutionMxN && opConfig->Traversal() != EthosUTraversal::PartKernel) || + npuOp == EthosU55NpuOp::VectorProduct || npuOp == EthosU55NpuOp::ReduceSum ) + { + numKernelSteps = subKernelElements; + cycles = std::max(cyclesWb, 4 * numUBlocks.ElementsWH()) * numKernelSteps * numUBlocks.Depth(); + } + else + { + assert(opConfig->Traversal() == EthosUTraversal::PartKernel); + int divider = (ifmBits == 16) ? 2 : 4; + numKernelSteps = DivRoundUp(subKernelElements, divider); + cycles = std::max(cyclesWb, 4 * numUBlocks.ElementsWH()) * numKernelSteps * numUBlocks.Depth() * + DivRoundUp(ifmBlock.Depth(), 8); + } + + // Calculate delay + int delayCycles = 0; + if ( _arch->IsU55_32() ) + { + int delay = use40BitAcc ? 
7 : 3; + if ( numUBlocks.ElementsWH() == 1 ) + { + if ( numUBlocks.Depth() == 1 ) + { + delayCycles = delay * numKernelSteps; + } + else if ( numKernelSteps > 1 ) + { + delayCycles = delay * (numKernelSteps - 1) * numUBlocks.Depth(); + } + } + + if ( (numUBlocks.Width() == 1 || numUBlocks.Height() == 1) && (numUBlocks.Depth() > 1) && use40BitAcc ) + { + delayCycles += delay * numUBlocks.Depth(); + } + } + else + { + int delay = (use40BitAcc && (_arch->_macs <= 128)) ? 3 : 2; + + if ( numUBlocks.ElementsWH() == 1 ) + { + if ( numUBlocks.Depth() == 1 ) + { + delayCycles = delay * numKernelSteps; + } + else if ( numKernelSteps > 1 ) + { + delayCycles = delay * (numKernelSteps - 1) * numUBlocks.Depth(); + } + } + } + + if ( isConvolutionMxN && opConfig->Traversal() == EthosUTraversal::PartKernel ) + { + delayCycles *= DivRoundUp(ifmBlock.Depth(), 8); + } + + cyclesDpuBlk += cycles; + cyclesDpuBlk += delayCycles; + } + } + + if ( npuOp == EthosU55NpuOp::Convolution || npuOp == EthosU55NpuOp::VectorProduct || npuOp == EthosU55NpuOp::ReduceSum ) + { + cyclesDpuBlk *= DivRoundUp(query.ifmShape[0].Depth(), ifmBlock.Depth()); + } + + cyclesDpuBlk /= _arch->_cores; + + // Estimate output cycles + int numOfmBlks = Shape::DivRoundUp(query.ofmShape, ofmBlock).Elements(); + int64_t cyclesOutputBlk = int64_t(EstimateOutputCyclesPerElement(query, fused) * float(ofmBlock.Elements())); + + // Scale and bias tensor + if ( query.constShape.Size() > 0 && query.constShape.Depth() > 0 ) + { + int cyclesBiasBlk = (10 * ofmBlock.Depth() * query.constMemory->ReadLatency() / 256); + cyclesOutputBlk = std::max(cyclesOutputBlk, int64_t(cyclesBiasBlk)); + } + + int64_t cycles_cmd = EstimateMinimumMemoryCycles(query); + cycles_cmd = (cycles_cmd + cyclesOutputBlk + cyclesDpuBlk) / 4; // Per DPU + + cyclesDpuBlk = std::max(cyclesDpuBlk, cycles_cmd); + cyclesOutputBlk = std::max(cyclesOutputBlk, cycles_cmd); + + int64_t totalCycles = 0; + if ( cyclesDpuBlk > cyclesOutputBlk ) + { + totalCycles = 
cyclesDpuBlk * numOfmBlks + cyclesOutputBlk; + } + else + { + totalCycles = cyclesOutputBlk * numOfmBlks + cyclesDpuBlk; + } + + return totalCycles; +} + +static int EstimateMemoryTransfer(int cores, bool isRead, ArchitectureMemory *memory, TensorFormat format, + int elementBits, Shape block, Shape shape, int toTransfer) +{ + int burstLen = 8; + + if ( format == TensorFormat::NHCWB16 ) + { + int zStride = (shape.Width() * elementBits * 16) / 8; + if ( zStride == block.Depth() ) + { + burstLen = elementBits * block.Depth() * block.Width(); + } + else if ( isRead ) + { + burstLen = 16 * elementBits * block.Width(); + } + else + { + burstLen = 16 * elementBits * block.Width() * cores; + } + } + else if ( format == TensorFormat::NHWC ) + { + int xStride = (shape.Depth() * elementBits) / 8; + if ( isRead ) + { + if ( xStride == block.Depth() ) + { + burstLen = elementBits * block.Depth() * block.Width(); + } + else + { + burstLen = elementBits * block.Depth(); + } + } + else + { + if ( (block.Depth() <= 16) && xStride == block.Depth() ) + { + burstLen = elementBits * block.Depth() * block.Width(); + } + else + { + burstLen = std::min(std::min(64 * 8, 16 * elementBits * cores), block.Depth() * elementBits); + } + } + } + + burstLen = std::min(memory->MaxBurstLength(), burstLen / 8); + assert(burstLen > 0 && "Burst length cannot be zero"); + return (toTransfer * memory->MaxBurstLength()) / burstLen; +} + + +int64_t EthosU55Performance::EstimateMinimumMemoryCycles(const PerformanceQuery &query) +{ + EthosU55OpConfig *opConfig = static_cast(query.config); + + int ifmBits = DataTypeSizeBits(query.ifmType[0]); // All inputs expect same bit width + const int ifmCount = query.ifmShape[1].Elements() > 0 ? 
int(std::size(query.ifmShape)) : 1; + int64_t cyclesIfm = 0; + for ( int i = 0; i < ifmCount; i++ ) + { + // Input block HW transfer (only for elements present) + int ifmBytes = Shape::Min(query.ifmShape[i], opConfig->IfmBlock()).Elements() * ifmBits / 8; + int64_t cyclesIfmBlk = query.ifmMemory[i]->ReadLatency(); + int64_t tx = EstimateMemoryTransfer(_arch->_cores, true, query.ifmMemory[i], query.ifmFormat[i], ifmBits, + opConfig->IfmBlock(), query.ifmShape[i], ifmBytes); + cyclesIfmBlk += int64_t(float(tx) / query.ifmMemory[i]->Bandwidth()); + + cyclesIfm = std::max(cyclesIfm, cyclesIfmBlk); + } + + // Output block HW transfer (only for elements present) + int ofmBits = DataTypeSizeBits(query.ofmType); + int ofmBytes = Shape::Min(query.ofmShape, opConfig->OfmBlock()).Elements() * ofmBits / 8; + int64_t cyclesOfm = query.ofmMemory->WriteLatency(); + int64_t tx = EstimateMemoryTransfer(_arch->_cores, false, query.ofmMemory, query.ofmFormat, ofmBits, + opConfig->OfmBlock(), query.ofmShape, ofmBytes); + cyclesOfm += int64_t(float(tx) / query.ofmMemory->Bandwidth()); + + return cyclesIfm + cyclesOfm; +} + + +float EthosU55Performance::EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused) +{ + EthosU55OpConfig *opConfig = static_cast(query.config); + auto npuOp = _arch->GetHWOp(query.type); + assert(npuOp != EthosU55NpuOp::None); + int ifmBits = DataTypeSizeBits(query.ifmType[0]); + int ofmBits = DataTypeSizeBits(query.ofmType); + int outputPerfIndex = 0; + + if ( (npuOp == EthosU55NpuOp::Elementwise) && (ifmBits == 32) ) + { + // Unary op else Binary op + outputPerfIndex = query.ifmShape[1].Elements() > 0 ? 
1 : 0; + } + else if ( query.type == OpType::Mul && ofmBits == 32 ) + { + outputPerfIndex = 2; + } + else if ( (query.type == OpType::Mul) || ((npuOp != EthosU55NpuOp::Elementwise) && opConfig->Acc() == EthosU55SHRamElements::SHRAM_Acc40) ) + { + outputPerfIndex = 3; + } + else if ( query.type == OpType::Add || query.type == OpType::Sub ) + { + if ( false ) + { + // Simple Add/Sub + outputPerfIndex = 4; + } + else + { + // Advanced Add/Sub TODO: Add as perf selection as operator variant + outputPerfIndex = 5; + } + } + else if ( query.type == OpType::MaxPool ) + { + outputPerfIndex = 6; + } + else + { + outputPerfIndex = 7; + } + + int activationPerfIndex = 0; + assert(fused.size() <= 1 && "multiple op performance not available"); + for ( const FusionQuery &fusedOp : fused ) + { + if ( fusedOp.type == OpType::Sigmoid || fusedOp.type == OpType::Tanh || fusedOp.type == OpType::LookupTable ) + { + activationPerfIndex = 0; + } + else if ( fusedOp.type == OpType::Relu || fusedOp.type == OpType::Relu6 || fusedOp.type == OpType::ReluN1To1 ) + { + activationPerfIndex = 1; + } + else + { + activationPerfIndex = 2; + } + } + + float cyclesPerElement = std::max(_perfInfo->outputCycles[outputPerfIndex], _perfInfo->activationCycles[activationPerfIndex]); + + if ( npuOp == EthosU55NpuOp::Elementwise ) + { + int numElemsBlk = opConfig->OfmBlock().Elements(); + assert(numElemsBlk > 0); + float cycleCmd = (float(EstimateMinimumMemoryCycles(query)) / float(numElemsBlk) + cyclesPerElement) / 4.0f; // per DPU + cyclesPerElement = std::max(cyclesPerElement, cycleCmd); + } + + return cyclesPerElement; +} + +ElementAccess EthosU55Performance::MeasureElementAccess(const PerformanceQuery &query) +{ + ElementAccess access; + EthosU55OpConfig *opConfig = static_cast(query.config); + auto npuOp = _arch->GetHWOp(query.type); + assert(npuOp != EthosU55NpuOp::None); + + Shape ifmBlock = Shape::Min(query.ifmShape[0], opConfig->IfmBlock()); + Shape ofmBlock = Shape::Min(query.ofmShape, 
opConfig->OfmBlock()); + + Shape ifmRounding = _arch->GetStorageRounding(query.ifmFormat[0]); + Shape ofmRounding = _arch->GetStorageRounding(query.ofmFormat); + + // Number of ofm blocks in the overall output shape + Shape ofmBlocks = Shape::DivRoundUp(query.ofmShape, ofmBlock); + + int ofmBlockDepth = ofmBlock.Depth(); + if ( npuOp == EthosU55NpuOp::Depthwise || npuOp == EthosU55NpuOp::Pooling ) + { + ofmBlocks = ofmBlocks.WithDepth(1); + ofmBlockDepth = query.ifmShape[0].Depth(); + } + + // Convolution & pooling + if ( OpUsesMacs(npuOp) ) + { + // Number of sub kernels + int subKernelWidth = s_SubkernelLimits[int(npuOp)].x; + int subKernelHeight = s_SubkernelLimits[int(npuOp)].y; + int subkernels = DivRoundUp(query.kernel->Size().x, subKernelWidth) * DivRoundUp(query.kernel->Size().y, subKernelHeight); + + int ifmFetch = + (Shape::RoundAway(ifmBlock, ifmRounding).ElementsWH() * Shape::RoundAway(query.ifmShape[0], ifmRounding).Depth()); + + int kernelRead = query.kernel->Size().AreaXY(); + if ( (npuOp != EthosU55NpuOp::Depthwise) && (npuOp != EthosU55NpuOp::Pooling) ) + { + kernelRead *= query.ifmShape[0].Depth(); + } + + int ofmBlockCount = ofmBlocks.Elements(); + + access.ifmRead[0] = ifmFetch * subkernels * ofmBlockCount; + + if ( (npuOp != EthosU55NpuOp::Pooling) && (npuOp != EthosU55NpuOp::ReduceSum) ) + { + int weightFetch = kernelRead * ofmBlockDepth * ofmBlockCount; + access.constRead[0] = weightFetch; + access.constRead[1] = query.ofmShape.Depth(); // Scales & biases + access.weightsRefetch = ofmBlocks.ElementsWH(); + } + } + else if ( npuOp == EthosU55NpuOp::Elementwise ) + { + // IFM1 is scalar + if ( query.ifmShape[0].Elements() == 1 ) + { + if ( DataTypeSizeBits(query.ifmType[0]) > 8 ) // IFM1 is a non 8-bit scalar + { + access.ifmRead[0] = Shape::RoundAway(query.ifmShape[0], ifmRounding).Elements(); + } + else if ( query.ifmShape[1].Elements() > 0 ) + { + access.ifmRead[1] = Shape::RoundAway(query.ofmShape, ifmRounding).Elements(); + } + } + else // 
IFM1 is not scalar + { + access.ifmRead[0] = Shape::RoundAway(query.ofmShape, ifmRounding).Elements(); + if ( query.ifmShape[1].Elements() > 0 ) + { + // IFM2 is not scalar + if ( query.ifmShape[1].Elements() > 1 ) + { + access.ifmRead[1] = access.ifmRead[0]; + } + else if ( DataTypeSizeBits(query.ifmType[1]) > 8 ) // IFM2 is a non 8-bit scalar + { + access.ifmRead[1] = Shape::RoundAway(query.ifmShape[1], ifmRounding).Elements(); + } + } + } + } + else + { + assert(false); + } + + access.ofmWrite = Shape::RoundAway(query.ofmShape, ofmRounding).Elements(); + + return access; +} + + +ElementAccess EthosU55Performance::ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) +{ + EthosU55OpConfig *opConfig = static_cast(query.config); + + ElementAccess result = access; + + // IFM bytes transferred + int ifmBits = DataTypeSizeBits(query.ifmType[0]); // All inputs expect same bit width + const int ifmCount = query.ifmShape[1].Elements() > 0 ? int(std::size(query.ifmShape)) : 1; + for ( int i = 0; i < ifmCount; i++ ) + { + result.ifmRead[i] = EstimateMemoryTransfer(_arch->_cores, true, query.ifmMemory[i], query.ifmFormat[i], ifmBits, + opConfig->IfmBlock(), query.ifmShape[i], access.ifmRead[i]); + } + + // OFM bytes transferred + result.ofmWrite = EstimateMemoryTransfer(_arch->_cores, false, query.ofmMemory, query.ofmFormat, + DataTypeSizeBits(query.ofmType), opConfig->OfmBlock(), query.ofmShape, access.ofmWrite); + + // These requires compression ratio information + result.constRead[0] = 0; + result.constRead[1] = 0; + + return result; +} + + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp new file mode 100644 index 00000000..c39e2c40 --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_performance.hpp @@ -0,0 +1,60 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023 Arm Limited and/or its affiliates +// +// 
SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "architecture/architecture.hpp" + +namespace regor +{ + +class ArchEthosU55; + +struct EthosU55PerfInfo +{ + float outputCycles[8]; + float activationCycles[3]; +}; + +/// +/// Profiles performance analysis for Ethos-U55 +/// +class EthosU55Performance : public ArchitecturePerformance +{ +protected: + ArchEthosU55 *_arch; + const EthosU55PerfInfo *_perfInfo; + +public: + EthosU55Performance(ArchEthosU55 *arch, const EthosU55PerfInfo *perfInfo); + +public: + CycleCost MeasureCycleCost(const PerformanceQuery &query, const std::vector &fused) override; + int64_t MemToMemCycles(const ArchitectureMemory *dest, const ArchitectureMemory *source, int sizeBytes) override; + ElementAccess MeasureElementAccess(const PerformanceQuery &query) override; + ElementAccess ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) override; + +private: + int64_t EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused); + float EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused); + int64_t EstimateMinimumMemoryCycles(const PerformanceQuery &query); +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp new file mode 100644 index 
00000000..092e40f5 --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.cpp @@ -0,0 +1,1469 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "ethos_u55_register_cs_generator.hpp" + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "architecture/ethos_u_scaling.hpp" +#include "common/data_type.hpp" +#include "compiler/high_level_command_stream.hpp" +#include "compiler/op_type.hpp" +#include "ethos_u55.hpp" +#include "ethos_u55_scaling.hpp" +#define NPU_DISASSEMBLE +#define NPU_NAMESPACE ethosu65 +#include "architecture/ethosu65/ethos_u65_interface.hpp" + +#include +#include +#include +#include + +namespace regor +{ +using namespace ethosu65; + +void Emitter::Emit(uint32_t instr) +{ + uint16_t cmd = instr & 0xFFFF; + assert(IsCmd0(cmd)); + bool emit = IsOp(cmd) || SetRegister(cmd, instr); + if ( emit ) + { + _stream.push_back(instr); + } +} + +void Emitter::Emit(uint64_t instr) +{ + uint16_t cmd = instr & 0xFFFF; + assert(IsCmd1(cmd)); + bool emit = IsOp(cmd) || SetRegister(cmd, instr); + if ( emit ) + { + _stream.push_back(uint32_t(instr)); + _stream.push_back(uint32_t(instr >> 32)); + } +} + +void Emitter::Clear() +{ + _stream.clear(); + _registers.clear(); +} + + +bool Emitter::SetRegister(uint16_t reg, uint64_t value) +{ + auto item = _registers.find(reg); + bool 
isChanged = item == _registers.end() || item->second != value; + if ( isChanged ) + { + _registers[reg] = value; + } + return isChanged; +} + +bool Emitter::IsCmd0(uint16_t key) +{ + return (key >> 14) == uint16_t(ethosu65::cmd_ctrl::CMD0_CTRL); +} + +bool Emitter::IsCmd1(uint16_t key) +{ + return (key >> 14) == uint16_t(ethosu65::cmd_ctrl::CMD1_CTRL); +} + +bool Emitter::IsOp(uint16_t key) +{ + return IsCmd0(key) ? (key & (1 << 8)) == 0 : (key & (1 << 8)) != 0; +} + + +/// +/// Generates register command streams for Ethos U55 and Ethos U65. +/// + +namespace +{ +const std::unordered_map<OpType, elementwise_mode> s_ElementwiseMap = { + {OpType::Add, elementwise_mode::ADD}, + {OpType::Sub, elementwise_mode::SUB}, + {OpType::Abs, elementwise_mode::ABS}, + {OpType::Mul, elementwise_mode::MUL}, + {OpType::Minimum, elementwise_mode::MIN}, + {OpType::Maximum, elementwise_mode::MAX}, + {OpType::LeakyRelu, elementwise_mode::LRELU}, + {OpType::CLZ, elementwise_mode::CLZ}, + {OpType::SHL, elementwise_mode::SHL}, + {OpType::Asr, elementwise_mode::SHR}, +}; + +activation_type ToActivationType(DataType type) +{ + if ( IsSignedInteger(type) ) + { + return activation_type::SIGNED; + } + else + { + assert(IsInteger(type)); + return activation_type::UNSIGNED; + } +} + +activation_format ToActivationFormat(TensorFormat format) +{ + if ( format == TensorFormat::NHCWB16 ) + { + return activation_format::NHCWB16; + } + else + { + assert(format == TensorFormat::NHWC); + return activation_format::NHWC; + } +} + +activation_precision ToActivationPrecision(DataType type) +{ + switch ( DataTypeSizeBits(type) ) + { + case 8: + return activation_precision::B8; + case 16: + return activation_precision::B16; + case 32: + return activation_precision::B32; + case 64: + return activation_precision::B64; + default: + assert(false); + return activation_precision::B64; + } +} + +ifm_upscale_mode ToIfmUpscaleMode(ArchResampling resampling) +{ + + if ( resampling == ArchResampling::Nearest ) + { + return
ifm_upscale_mode::NEAREST; + } + if ( resampling == ArchResampling::Zeros ) + { + return ifm_upscale_mode::ZEROS; + } + return ifm_upscale_mode::NONE; +} + +RCSRoundMode GetRoundingMode(const HLCOperation *op) +{ + switch ( op->rounding ) + { + case HLCRoundMode::NATURAL: + return RCSRoundMode::NATURAL; + case HLCRoundMode::TRUNCATE: + return RCSRoundMode::TRUNCATE; + case HLCRoundMode::DBL: + return RCSRoundMode::DBL; + case HLCRoundMode::AUTO: + return RCSRoundMode::DBL; + default: + return RCSRoundMode::DBL; + } +} + +ifm_scale_mode MapRcsIfmScaleModeToInterface(RCSIfmScaleMode rcsScaleMode) +{ + switch ( rcsScaleMode ) + { + case RCSIfmScaleMode::OPA_OPB_16: + return ifm_scale_mode::OPA_OPB_16; + case RCSIfmScaleMode::OPA_32: + return ifm_scale_mode::OPA_32; + case RCSIfmScaleMode::OPB_32: + return ifm_scale_mode::OPB_32; + default: + assert(0 && "Unexpected value, has the interface changed?"); + return ifm_scale_mode::OPA_32; + } +} + +round_mode MapRcsRoundModeToInterface(RCSRoundMode rcsRoundMode) +{ + switch ( rcsRoundMode ) + { + case RCSRoundMode::NATURAL: + return round_mode::NATURAL; + case RCSRoundMode::TRUNCATE: + return round_mode::TRUNCATE; + case RCSRoundMode::DBL: + return round_mode::DBL; + default: + assert(0 && "Unexpected value, has the interface changed?"); + return round_mode::DBL; + } +} + +} // namespace + +uint32_t EthosU55RCSGenerator::IdRegister() +{ + return id_r{}; +} + +bool EthosU55RCSGenerator::IsSupportedElementwise(const OpType opType) +{ + return s_ElementwiseMap.count(opType) != 0; +} + +EthosU55RCSGenerator::EthosU55RCSGenerator(ArchEthosU55 *arch) : _arch(arch) +{ +} + + +void EthosU55RCSGenerator::Emit(uint32_t instr) +{ + _emit.Emit(instr); +} + +void EthosU55RCSGenerator::Emit(uint64_t instr) +{ + _emit.Emit(instr); +} + +int EthosU55RCSGenerator::GetDoubleBufferOffset(HLCWeights *weights, int rangeIndex) +{ + int doubleBufferOffset = 0; + if ( weights->buffering == Buffering::Double ) + { + assert(weights->subStreams > 
0); + int depthIndex = rangeIndex / weights->subStreams; + if ( depthIndex % 2 == 1 ) + { + doubleBufferOffset = weights->maxRangeBytes; + } + } + return doubleBufferOffset; +} + +void EthosU55RCSGenerator::CheckAddressRange(ArchitectureMemory *memory, Address address, int size) +{ + assert(address >= 0); + if ( address >= memory->SizeBytes() ) + { + LOG_ERROR("Error: Address out of bounds, address {0}, memory '{1}' with size {2}\n", address, memory->Name(), + memory->SizeBytes()); + // TODO: replace assert by error handling + assert(false && "Address out of bounds"); + } + assert(size >= 0); + if ( address + size > memory->SizeBytes() ) + { + LOG_ERROR("Error: Address offset out of bounds, address {0}, offset {1}, memory '{2}' with size {3}\n", address, + size, memory->Name(), memory->SizeBytes()); + // TODO: replace assert by error handling + assert(false && "address offset out of bounds"); + } +} + +void EthosU55RCSGenerator::CheckAddresses(const HLCFeatureMap &fm) +{ + CheckAddressRange(fm.memArea.memory, fm.address, fm.AllocationSizeBytes()); + assert(fm.address % 16 == 0 || fm.format != TensorFormat::NHCWB16); +} + +// Calculates the rolling buffer address of the given coordinate. 
+Address EthosU55RCSGenerator::AddressForCoordinate(const HLCFeatureMap &fm, const Shape &strides, const Shape &coord) +{ + Shape truncatedCoord = Shape::PadAxes(coord, 4, 0) % Shape::PadAxes(fm.shape, 4, 1); + int offset = 0; + if ( fm.format == TensorFormat::NHWC ) + { + offset = strides.Dot(truncatedCoord); + } + else if ( fm.format == TensorFormat::NHCWB16 ) + { + constexpr int BRICK = 16; + int elemSize = DataTypeSizeBits(fm.dataType) / 8; + int strideX = BRICK * elemSize; + offset = + truncatedCoord.Height() * strides.Height() + truncatedCoord.Width() * strideX + + (truncatedCoord.Depth() / BRICK) * strides.Depth() + (truncatedCoord.Depth() % BRICK) * elemSize + + truncatedCoord.Batch() * strides.Batch(); + } + else + { + assert(false); + } + return fm.address + offset; +} + +// Calculates tile sizes/addresses of a feature map +TileBox EthosU55RCSGenerator::GetTiles(const HLCFeatureMap &fm, const Shape &strides, const Box &area) +{ + int crossingY = RoundAway(area.Start().Height() + 1, fm.shape.Height()); + crossingY = std::min(crossingY, area.End().Height()); + int crossingX = RoundAway(area.Start().Width() + 1, fm.shape.Width()); + crossingX = std::min(crossingX, area.End().Width()); + TileBox tiles; + auto height = crossingY - area.Start().Height(); + auto width = crossingX - area.Start().Width(); + tiles.height0 = (height + fm.stepXY.y - 1) / fm.stepXY.y; + tiles.height1 = tiles.height0; + tiles.width0 = (width + fm.stepXY.x - 1) / fm.stepXY.x; + for ( int i = 0; i < 4; ++i ) + { + tiles.address[i] = 0; + } + int fmSize = fm.AllocationSizeBytes(); + tiles.address[0] = AddressForCoordinate(fm, strides, area.Start()); + assert(fm.address <= tiles.address[0] && tiles.address[0] < fm.address + fmSize); + if ( area.End().Width() > crossingX ) + { + tiles.address[1] = AddressForCoordinate(fm, strides, area.Start().WithWidth(crossingX)); + assert(fm.address <= tiles.address[1] && tiles.address[1] < fm.address + fmSize); + assert(false && "Striping in vertical 
direction is not supported"); + } + if ( area.End().Height() > crossingY ) + { + tiles.address[2] = AddressForCoordinate(fm, strides, area.Start().WithHeight(crossingY)); + assert(fm.address <= tiles.address[2] && tiles.address[2] < fm.address + fmSize); + } + if ( area.End().Width() > crossingX && area.End().Height() > crossingY ) + { + tiles.address[3] = AddressForCoordinate(fm, strides, area.Start().WithWidth(crossingX).WithHeight(crossingY)); + assert(fm.address <= tiles.address[3] && tiles.address[3] < fm.address + fmSize); + } + if ( fm.format == TensorFormat::NHCWB16 ) + { + for ( int i = 0; i < 4; ++i ) + { + assert(tiles.address[i] % 16 == 0 && "NHCWB16 base address is not 16-byte aligned"); + } + } + return tiles; +} + +MemoryAccess EthosU55RCSGenerator::ToMemoryAccess(const HLCFeatureMap &fm, const Box &area, AccessDirection direction) +{ + const auto &strides = fm.strides; + Address start = AddressForCoordinate(fm, strides, area.Start()); + // Note: due to truncating of shape, AddressForCoordinate(fm, .., fm.shape) returns + // fm.address; the - Shape(1, 1, 1) prevents this + Address end = AddressForCoordinate(fm, strides, area.End() - Shape(1, 1, 1)) + DataTypeSizeBits(fm.dataType) / 8; + if ( end < start ) + { + // Area wraps around the end of the feature map + start = fm.address; + end = fm.address + fm.AllocationSizeBytes(); + } + return MemoryAccess(direction, fm.memArea, start, end); +} + +// Returns region number used in NPU_SET_..._REGION +uint32_t EthosU55RCSGenerator::ToRegion(const MemArea &memArea) +{ + auto region = BasePointerIndex::WeightTensor; + if ( memArea == _arch->FeatureMapMemory() ) + { + region = BasePointerIndex::ScratchTensor; + } + else if ( memArea == _arch->StagingMemory() ) + { + region = BasePointerIndex::ScratchFastTensor; + } + else if ( memArea == _arch->LUTMemory() ) + { + region = BasePointerIndex::Mem2Mem; + } + else + { + assert(memArea == _arch->ReadonlyMemory()); + } + return uint32_t(region); +} + +bool 
EthosU55RCSGenerator::UseZeroPoint0(OpType opType, const HLCFeatureMap &fm, bool isOFM) +{ + if ( fm.quantization.forceZeroPoint ) + { + return false; + } + if ( fm.quantization.zeroPoints.empty() || (fm.dataType == DataType::Int32 && !isOFM) ) + { + return true; + } + return opType == OpType::AvgPool || opType == OpType::ResizeBilinear || opType == OpType::CLZ || opType == OpType::SHL; +} + +// Checks if the feature map is a scalar, and if so, returns the +// quantized value in scalarValue. +bool EthosU55RCSGenerator::IsScalar(const HLCFeatureMap &fm, int32_t &scalarValue) +{ + const auto &view = fm.bufferView; + // A 1-sized feature map in constant memory is a scalar + bool isScalar = fm.shape.Elements() == 1 && view.HasBuffer(); + if ( isScalar ) + { + if ( fm.dataType == DataType::Int8 ) + { + scalarValue = view.Values()[0]; + } + else if ( fm.dataType == DataType::UInt8 ) + { + scalarValue = view.Values()[0]; + } + else if ( fm.dataType == DataType::Int16 ) + { + scalarValue = view.Values()[0]; + } + else + { // Unsupported scalar value + isScalar = false; + } + } + return isScalar; +} + +// Calculates waits for KERNEL_WAIT/DMA_WAIT, returns -1 if no wait is needed +// - opAccesses contains the memory accesses for the current operation +// - outstanding contains the memory accesses for ongoing "other" operations +// (DMA operations if the current op is an NPU operation, NPU operations if the current op is a DMA operation) +// Note: NPU - NPU dependency is handled via block dependency +int EthosU55RCSGenerator::CalcCommandWaits(const MemoryAccesses &opAccesses, std::deque &outstanding) +{ + int waits = 0; + for ( int index = int(outstanding.size()) - 1; index >= 0; ++waits, --index ) + { + for ( const auto &access : opAccesses ) + { + for ( const auto &outstandingAccess : outstanding[index] ) + { + if ( access.Conflicts(outstandingAccess) ) + { + // Current op needs to wait, and after it has waited, + // outstanding[0..index] are not outstanding any longer + 
for ( int i = 0; i <= index; ++i ) + { + outstanding.pop_front(); + } + return waits; + } + } + } + } + return -1; +} + +// Returns LUT slot to be used for the given LUT operation. +// Sets alreadyInLutMem to true if the LUT is already in SHRAM. +int EthosU55RCSGenerator::AllocateLutSlot( + std::vector &lutSlots, const HLCOperation *op, int sizeInSlots, int timestamp, bool &alreadyInLutMem) +{ + alreadyInLutMem = false; + int totalSlots = int(lutSlots.size()); + if ( sizeInSlots < 0 || sizeInSlots > totalSlots ) + { + assert(false); + return 0; + } + // Returns least recently used slot, unless the LUT is already in memory + int allocatedSlot = 0; + for ( int i = 0; i < totalSlots; i += sizeInSlots ) + { + if ( lutSlots[i].hlcOp == op ) + { + // LUT is already in SHRAM + allocatedSlot = i; + alreadyInLutMem = true; + break; + } + if ( lutSlots[i].lastUsed < lutSlots[allocatedSlot].lastUsed ) + { + allocatedSlot = i; + } + } + for ( int j = allocatedSlot; j < allocatedSlot + sizeInSlots; ++j ) + { + lutSlots[j].hlcOp = op; + lutSlots[j].lastUsed = timestamp; + } + return allocatedSlot; +} + +//---------------------------------------------------------------------- +// Print +//---------------------------------------------------------------------- + +int EthosU55RCSGenerator::Disassemble(const uint32_t *in, std::string &op, std::vector> &fields) +{ + return isa::disassemble(in, op, fields); +} + +//---------------------------------------------------------------------- +// Scaling (OFM/OPA/OPB_SCALE) +//---------------------------------------------------------------------- + +// Generates OFM_SCALE register for pooling operations +void EthosU55RCSGenerator::GenerateOFMScalingForPooling(HLCOperation *poolOp) +{ + QuantizedScale quant(1, 0); + bool isNoOp = _arch->UseAvgPoolNop(poolOp->type); + ethosU55Scaling::RescalePooling(poolOp, isNoOp); + + if ( poolOp->ofm.quantization && !poolOp->ofm.quantization.scales.empty() ) + { + quant = poolOp->ofm.quantization.scales[0]; + 
assert(unsigned(quant.shift) < 64); + } + + Emit(isa::npu_set_ofm_scale_t(uint32_t(quant.shift), quant.scale)); +} + +// Generates OFM/OPA/OPB_SCALE registers for elementwise operators. +// Returns the operator to scale +RCSIfmScaleMode EthosU55RCSGenerator::GenerateScalingForElementwise(HLCOperation *op, int ifm0Index) +{ + auto opToScale = RCSIfmScaleMode::OPA_OPB_16; + auto opType = op->type; + + QuantizedScale ofmScale(1, 0); + ethosU55Scaling::RescaleElementwise(op); + int ifmCnt = int(op->ifm.size()); + bool allHaveScale = + !op->ofm.quantization.scales.empty() && !op->ifm[0].quantization.scales.empty() && ifmCnt == 2 && + !op->ifm[1].quantization.scales.empty(); + + if ( opType == OpType::Mul || opType == OpType::Abs ) + { + if ( !op->ofm.quantization.scales.empty() ) + { + ofmScale = op->ofm.quantization.scales[0]; + } + } + else if ( opType == OpType::LeakyRelu ) + { + const HLCParameters *params = &op->parameters; + double alpha = params->leaky_relu.alpha; + ofmScale = QuantizedScale(alpha); + } + else if ( opType == OpType::Add || opType == OpType::Sub ) + { + uint32_t opaScale = 1; + uint32_t opbScale = 1; + uint32_t opaShift = 0; + if ( allHaveScale ) + { + ofmScale = op->ofm.quantization.scales[0]; + QuantizedScale ifm1Scale = op->ifm[ifm0Index].quantization.scales[0]; + QuantizedScale ifm2Scale = op->ifm[1 - ifm0Index].quantization.scales[0]; + opaScale = ifm1Scale.scale; + opaShift = ifm1Scale.shift; + opbScale = ifm2Scale.scale; + + if ( ifm1Scale.scale == 0 || ifm2Scale.scale == 0 ) + { + opbScale = 0; + if ( ifm1Scale.scale == 0 ) + { + opToScale = RCSIfmScaleMode::OPB_32; + opaScale = ifm2Scale.scale; + opaShift = ifm2Scale.shift; + } + else + { + opToScale = RCSIfmScaleMode::OPA_32; + } + } + if ( ifm0Index == 1 ) + { + // Reversed operands + if ( opToScale == RCSIfmScaleMode::OPA_32 ) + { + opToScale = RCSIfmScaleMode::OPB_32; + } + else if ( opToScale == RCSIfmScaleMode::OPB_32 ) + { + opToScale = RCSIfmScaleMode::OPA_32; + } + } + } + 
assert(opaShift < 64); + Emit(isa::npu_set_opa_scale_t(opaShift, opaScale)); + Emit(isa::npu_set_opb_scale_t(opbScale)); + } + assert(unsigned(ofmScale.shift) < 64); + Emit(isa::npu_set_ofm_scale_t(ofmScale.shift, ofmScale.scale)); + return opToScale; +} + +//---------------------------------------------------------------------- +// BLOCKDEP calculation +//---------------------------------------------------------------------- + +static Shape CalcIFMJobShape(const Shape &ofmBlock, Kernel *kernel, int ifmBlockDepth) +{ + // TODO MLBEDSW-8498: Consider ifm_upscale_mode for job-shape calculations + Point2i dilatedSize = kernel->DilatedWH(); + int h = RequiredInputSize(ofmBlock.Height(), kernel->Stride().y, dilatedSize.y, 1); + int w = RequiredInputSize(ofmBlock.Width(), kernel->Stride().x, dilatedSize.x, 1); + return Shape(1, h, w, ifmBlockDepth); +} + +// Given the area and block size, adds the first/last jobs (depending on fromStart) to jobs. +// - area: total amount of work to perform +// - jobShape: size of each job +// - fromStart: if true, the first jobs are added, if false, the last jobs are added +// (in that case, the very last job is added last) +void EthosU55RCSGenerator::GetJobs(const Box &area, const Shape &jobShape, int nrJobsToGet, bool fromStart, std::vector &jobs) +{ + Shape jobSplit = Shape::DivRoundUp(area.End() - area.Start(), jobShape); + int z = jobSplit.Depth(); + int w = jobSplit.Width(); + int h = jobSplit.Height(); + int n = z * w * h; // n = total number of jobs for the whole area + const auto &start = area.Start().Extract(-3, -2, -1); + const auto &end = area.End().Extract(-3, -2, -1); + int firstJob = fromStart ? 0 : std::max(0, n - nrJobsToGet); + int lastJob = fromStart ? 
std::min(n, nrJobsToGet) : n; + for ( int i = firstJob; i < lastJob; ++i ) + { + Shape from = Shape(start.Height() + (i / (z * w)) * jobShape.Height(), + start.Width() + ((i / z) % w) * jobShape.Width(), start.Depth() + (i % z) * jobShape.Depth()); + jobs.emplace_back(from, Shape::Min(from + jobShape, end)); + } +} + +// Calculates the value for the BLOCKDEP register +int EthosU55RCSGenerator::CalcBlockDep(HLCStripe *prevStripe, HLCStripe *stripe) +{ + if ( prevStripe == nullptr ) + { + return 0; + } + const auto &op = stripe->operation; + const auto &prevOp = prevStripe->operation; + const auto &prevOfm = prevOp->ofm; + if ( _arch->_shram.reservedEndBanks == 0 ) + { + // SHRAM has no reserved LUT banks + if ( _stripeToLutSlot.count(prevStripe) && !_stripeToLutSlot.count(stripe) ) + { + // Previous operation uses LUT, current does not + return 0; // Prevents corruption of the LUT + } + } + + int ifmIndex = (op->ifm.size() > 1 && op->ifm[1].address == prevOfm.address && op->ifm[1].memArea == prevOfm.memArea) ? 
1 : 0; + const auto &ifm = op->ifm[ifmIndex]; + int maxJobs = _arch->MaxBlockdep(); + if ( ifm.address != prevOfm.address || ifm.memArea != prevOfm.memArea ) + { + for ( const auto &fm : op->ifm ) + { + if ( fm.memArea == prevOfm.memArea && + Overlaps(fm.address, fm.address + fm.AllocationSizeBytes(), prevOfm.address, prevOfm.address + prevOfm.AllocationSizeBytes()) ) + { + // Previous OFM overlaps in unexpected way with current IFM + assert(false && "Unexpected overlap previous OFM/current IFM"); + return 0; + } + } + // Previous operation does not produce current operation's IFM + return maxJobs; + } + if ( op->ifm.size() > 1 && ifm.AllocationSizeBytes() < op->ifm[1 - ifmIndex].AllocationSizeBytes() ) + { + // Prev OFM produces IFM2 which is broadcasted (this should be rare) + return 0; + } + if ( prevOfm.shape != ifm.shape ) + { + // OFM has been reshaped; the job overlap calculations below do not work in this case + return 0; + } + // Previous operation produces current operations IFM + auto prevConfig = static_cast(prevOp->config); + Shape prevBlock = prevConfig->OfmBlock(); + auto config = static_cast(op->config); + Shape currBlock = CalcIFMJobShape(config->OfmBlock(), &op->kernel, config->IfmBlock().Depth()); + // Get the last few jobs from the previous operation (each job produces a part of the current op's IFM) + std::vector lastPrevJobs; + GetJobs(prevStripe->ofmArea, prevBlock, maxJobs, false, lastPrevJobs); + // Get the first few jobs from the current operation (each job consumes a part of the current op's IFM) + std::vector firstCurrJobs; + GetJobs(stripe->ifmAreas[ifmIndex], currBlock, maxJobs, true, firstCurrJobs); + // Find the highest block dependency such that there is no overlap between + // any job from the previous op with any job from the current op during block dependency jobs + int sz = int(std::min(lastPrevJobs.size(), firstCurrJobs.size())); + int prevLastIx = int(lastPrevJobs.size()) - 1; + for ( int blockdep = 0; blockdep < sz; 
++blockdep ) + { + bool overlaps = false; + for ( int i = 0; !overlaps && i <= blockdep; ++i ) + { + for ( int j = blockdep - i; !overlaps && i + j <= blockdep; ++j ) + { + if ( firstCurrJobs[i].Overlaps(lastPrevJobs[prevLastIx - j]) ) + { + overlaps = true; + } + } + } + if ( overlaps ) + { + return blockdep; + } + } + // No overlap found + return sz; +} + +//---------------------------------------------------------------------- +// Register generation +//---------------------------------------------------------------------- + +void EthosU55RCSGenerator::GeneratePadding(const HLCPadding &padding) +{ + Emit(isa::npu_set_ifm_pad_top_t(padding.top)); + Emit(isa::npu_set_ifm_pad_left_t(padding.left)); + Emit(isa::npu_set_ifm_pad_bottom_t(padding.bottom)); + Emit(isa::npu_set_ifm_pad_right_t(padding.right)); +} + +// Generates ACTIVATION registers +void EthosU55RCSGenerator::GenerateActivation(const HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + const HLCOperation *op = stripe->operation.get(); + assert(op->subOps.size() <= 1); + OpType opType = OpType::None; + if ( IsActivation(op->type) ) + { + // Non-fused activation + opType = op->type; + assert(op->subOps.empty() || opType == op->subOps[0].type); + } + else if ( !op->subOps.empty() ) + { + // Fused activation + opType = op->subOps[0].type; + } + auto &ofm = op->ofm; + int size = std::min(16, DataTypeSizeBits(ofm.dataType)); + assert(size > 0 && "Illegal data type"); + bool isSigned = bool(ofm.dataType & DataType::Signed); + int64_t quantizedMin = isSigned ? -(1LL << (size - 1)) : 0; + int64_t quantizedMax = isSigned ? 
(1LL << (size - 1)) - 1 : (1LL << size) - 1; + + auto act = activation_function::RELU; + auto clipRange = activation_clip_range::OFM_PRECISION; + if ( ofm.quantization.quantMin.size() ) + { + quantizedMin = std::max(quantizedMin, ofm.quantization.quantMin[0]); + } + if ( ofm.quantization.quantMax.size() ) + { + quantizedMax = std::min(quantizedMax, ofm.quantization.quantMax[0]); + } + + if ( opType == OpType::Sigmoid ) + { + act = activation_function::SIGMOID; + } + else if ( opType == OpType::Tanh ) + { + act = activation_function::TANH; + } + else if ( opType == OpType::LUT ) + { + auto ¶m = op->subOps[0].parameters.lut; + size = param.sizeBytes; + assert(size == 256 || size == 1024 || size == 2048); + + int tableIndex = 0; + auto pos = _stripeToLutSlot.find(stripe); + if ( pos != _stripeToLutSlot.end() ) + { + tableIndex = pos->second; + } + else + { + assert(false && "Command uses lut, but no lut info found"); + } + act = activation_function(int(activation_function::TABLE_0) + tableIndex); + if ( ofm.dataType == DataType::Int32 ) + { + // force INT8 range + clipRange = activation_clip_range::FORCE_INT8; + quantizedMin = std::max(quantizedMin, -128); + quantizedMax = std::min(quantizedMax, 127); + } + auto &layout = static_cast(op->config)->_layout; + Address lutStart = Address(layout.lutStart) * _arch->_shram.bankSizeBytes + tableIndex * _arch->_shram.lutSlotSize; + memoryAccesses.emplace_back(AccessDirection::Read, _arch->LUTMemory(), lutStart, lutStart + param.sizeBytes); + } + assert(quantizedMin <= std::numeric_limits::max()); + assert(quantizedMax <= std::numeric_limits::max()); + Emit(isa::npu_set_activation_t(act, clipRange)); + Emit(isa::npu_set_activation_min_t(uint32_t(quantizedMin))); + Emit(isa::npu_set_activation_max_t(uint32_t(quantizedMax))); +} + +// Generates KERNEL related registers +void EthosU55RCSGenerator::GenerateKernel(const Kernel &kernel, bool partKernel) +{ + auto dilatedWH = kernel.DilatedWH(); + 
Emit(isa::npu_set_kernel_height_m1_t(dilatedWH.y - 1)); + Emit(isa::npu_set_kernel_width_m1_t(dilatedWH.x - 1)); + uint32_t stride_x_lsb = (kernel.Stride().x - 1) & 1; + uint32_t stride_y_lsb = (kernel.Stride().y - 1) & 1; + uint32_t stride_x_msb = ((kernel.Stride().x - 1) >> 1) & 1; + uint32_t stride_y_msb = ((kernel.Stride().y - 1) >> 1) & 1; + auto weightOrder = partKernel ? weight_order::PART_KERNEL_FIRST : weight_order::DEPTH_FIRST; + kernel_dilation dilation_x = kernel_dilation(kernel.Dilation().x - 1); + kernel_dilation dilation_y = kernel_dilation(kernel.Dilation().y - 1); + kernel_decomposition decomposition = kernel_decomposition::D8X8; // Kernel decomposition + Emit(isa::npu_set_kernel_stride_t( + stride_x_lsb, stride_y_lsb, weightOrder, dilation_x, dilation_y, decomposition, stride_x_msb, stride_y_msb)); +} + +// Generates IFM2_BROADCAST register for binary elementwise operations +void EthosU55RCSGenerator::GenerateIFM2Broadcast(const Shape &ifmShape, const Shape &ifm2Shape, bool reversedOperands, bool isScalar) +{ + auto broadcastH = broadcast_mode::DISABLE; + auto broadcastW = broadcast_mode::DISABLE; + auto broadcastC = broadcast_mode::DISABLE; + auto order = reversedOperands ? 
ifm2_operand_order::ORDER_A : ifm2_operand_order::ORDER_B; + auto isConstant = broadcast_mode::DISABLE; + if ( isScalar ) + { + isConstant = broadcast_mode::ENABLE; + } + else + { + if ( ifmShape.Height() != ifm2Shape.Height() ) + { + // Broadcast in 'H' dimension + broadcastH = broadcast_mode::ENABLE; + assert(ifm2Shape.Height() == 1); + } + if ( ifmShape.Width() != ifm2Shape.Width() ) + { + // Broadcast in 'W' dimension + broadcastW = broadcast_mode::ENABLE; + assert(ifm2Shape.Width() == 1); + } + if ( ifmShape.Depth() != ifm2Shape.Depth() ) + { + // Broadcast in 'C' dimension + broadcastC = broadcast_mode::ENABLE; + assert(ifm2Shape.Depth() == 1); + } + } + Emit(isa::npu_set_ifm2_broadcast_t(broadcastH, broadcastW, broadcastC, order, isConstant)); +} + +// Generates IFM_PRECISION register +void EthosU55RCSGenerator::GenerateIFMPrecision(const HLCFeatureMap &fm, RCSIfmScaleMode scaleMode) +{ + activation_type type = ToActivationType(fm.dataType); + activation_precision precision = ToActivationPrecision(fm.dataType); + activation_format format = ToActivationFormat(fm.format); + round_mode roundMode = round_mode::DBL; + ifm_scale_mode interfaceScaleMode = MapRcsIfmScaleModeToInterface(scaleMode); + Emit(isa::npu_set_ifm_precision_t(type, precision, format, interfaceScaleMode, roundMode)); +} + +// Generates IFM2_PRECISION register +void EthosU55RCSGenerator::GenerateIFM2Precision(const HLCFeatureMap &fm) +{ + activation_type type = ToActivationType(fm.dataType); + activation_precision precision = ToActivationPrecision(fm.dataType); + activation_format format = ToActivationFormat(fm.format); + Emit(isa::npu_set_ifm2_precision_t(type, precision, format)); +} + +// Generates OFM_PRECISION register +void EthosU55RCSGenerator::GenerateOFMPrecision(const HLCFeatureMap &fm, bool useGlobalScale, RCSRoundMode roundMode) +{ + activation_type type = ToActivationType(fm.dataType); + activation_precision precision = ToActivationPrecision(fm.dataType); + activation_format format 
= ToActivationFormat(fm.format); + round_mode interfaceRoundMode = MapRcsRoundModeToInterface(roundMode); + auto scaleMode = useGlobalScale ? ofm_scale_mode::GLOBAL : ofm_scale_mode::PER_CHANNEL; + Emit(isa::npu_set_ofm_precision_t(type, precision, format, scaleMode, interfaceRoundMode)); +} + +// Generates common IFM registers +void EthosU55RCSGenerator::GenerateIFM(OpType opType, const HLCFeatureMap &fm, const Box &inputArea) +{ + CheckAddresses(fm); + Emit(isa::npu_set_ifm_region_t(ToRegion(fm.memArea))); + Shape strides = fm.strides; + auto tiles = GetTiles(fm, strides, inputArea); + auto boxSize = inputArea.SizeShape(); + // IFM_BASE registers + Emit(isa::npu_set_ifm_base0_t(tiles.address[0])); + Emit(isa::npu_set_ifm_base1_t(tiles.address[1])); + Emit(isa::npu_set_ifm_base2_t(tiles.address[2])); + Emit(isa::npu_set_ifm_base3_t(tiles.address[3])); + // Tile related registers + Emit(isa::npu_set_ifm_height0_m1_t(tiles.height0 - 1)); + Emit(isa::npu_set_ifm_height1_m1_t(tiles.height1 - 1)); + Emit(isa::npu_set_ifm_width0_m1_t(tiles.width0 - 1)); + Emit(isa::npu_set_ifm_depth_m1_t(boxSize.Depth() - 1)); + // IFM_STRIDE registers + Emit(isa::npu_set_ifm_stride_y_t(strides.Height() * fm.stepXY.y)); + Emit(isa::npu_set_ifm_stride_x_t(strides.Width() * fm.stepXY.x)); + Emit(isa::npu_set_ifm_stride_c_t(strides.Depth())); + // IFM_ZERO_POINT register + auto &quant = fm.quantization; + uint32_t zp = UseZeroPoint0(opType, fm, false) ? 
0 : uint32_t(quant.zeroPoints[0]); + Emit(isa::npu_set_ifm_zero_point_t(zp)); +} + +// Generates common IFM2 registers +void EthosU55RCSGenerator::GenerateIFM2(OpType opType, const HLCFeatureMap &fm, const Box &inputArea, bool isScalar, int32_t scalarValue) +{ + if ( isScalar ) + { + Emit(isa::npu_set_ifm2_scalar_t(uint32_t(scalarValue))); + } + else + { + CheckAddresses(fm); + Emit(isa::npu_set_ifm2_region_t(ToRegion(fm.memArea))); + Shape strides = fm.strides; + auto tiles = GetTiles(fm, strides, inputArea); + // IFM2_BASE registers + Emit(isa::npu_set_ifm2_base0_t(tiles.address[0])); + Emit(isa::npu_set_ifm2_base1_t(tiles.address[1])); + Emit(isa::npu_set_ifm2_base2_t(tiles.address[2])); + Emit(isa::npu_set_ifm2_base3_t(tiles.address[3])); + // Tile related registers + Emit(isa::npu_set_ifm2_height0_m1_t(tiles.height0 - 1)); + Emit(isa::npu_set_ifm2_height1_m1_t(tiles.height1 - 1)); + Emit(isa::npu_set_ifm2_width0_m1_t(tiles.width0 - 1)); + // IFM2_STRIDE registers + Emit(isa::npu_set_ifm2_stride_y_t(strides.Height() * fm.stepXY.y)); + Emit(isa::npu_set_ifm2_stride_x_t(strides.Width() * fm.stepXY.x)); + Emit(isa::npu_set_ifm2_stride_c_t(strides.Depth())); + } + // IFM2_ZERO_POINT register + auto &quant = fm.quantization; + uint32_t zp = UseZeroPoint0(opType, fm, false) ? 
0 : uint32_t(quant.zeroPoints[0]); + Emit(isa::npu_set_ifm2_zero_point_t(zp)); +} + +// Generates OFM registers +void EthosU55RCSGenerator::GenerateOFM(OpType opType, const HLCFeatureMap &fm, const Box &outputArea) +{ + CheckAddresses(fm); + Emit(isa::npu_set_ofm_region_t(ToRegion(fm.memArea))); + Shape strides = fm.strides; + auto tiles = GetTiles(fm, strides, outputArea); + auto boxSize = outputArea.SizeShape(); + // OFM_BASE registers + Emit(isa::npu_set_ofm_base0_t(tiles.address[0])); + Emit(isa::npu_set_ofm_base1_t(tiles.address[1])); + Emit(isa::npu_set_ofm_base2_t(tiles.address[2])); + Emit(isa::npu_set_ofm_base3_t(tiles.address[3])); + // OFM size + Emit(isa::npu_set_ofm_height_m1_t(DivRoundUp(boxSize.Height(), fm.stepXY.y) - 1)); + Emit(isa::npu_set_ofm_width_m1_t(DivRoundUp(boxSize.Width(), fm.stepXY.x) - 1)); + // Tile related registers + Emit(isa::npu_set_ofm_height0_m1_t(tiles.height0 - 1)); + Emit(isa::npu_set_ofm_height1_m1_t(tiles.height1 - 1)); + Emit(isa::npu_set_ofm_width0_m1_t(tiles.width0 - 1)); + Emit(isa::npu_set_ofm_depth_m1_t(boxSize.Depth() - 1)); + // OFM_STRIDE registers + Emit(isa::npu_set_ofm_stride_y_t(strides.Height() * fm.stepXY.y)); + Emit(isa::npu_set_ofm_stride_x_t(strides.Width() * fm.stepXY.x)); + Emit(isa::npu_set_ofm_stride_c_t(strides.Depth())); + // OFM_ZERO_POINT register + auto &quant = fm.quantization; + uint32_t zp = UseZeroPoint0(opType, fm, true) ? 
0 : uint32_t(quant.zeroPoints[0]); + Emit(isa::npu_set_ofm_zero_point_t(zp)); +} + +// Generates WEIGHT registers +void EthosU55RCSGenerator::GenerateWeights(const HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto weights = stripe->operation->weights.get(); + if ( weights == nullptr ) + { + return; + } + int depth = stripe->weightRangeDepth; + Emit(isa::npu_set_weight_region_t(ToRegion(weights->memArea))); + auto item0 = weights->encodedRanges.find(WeightKey(0, depth)); + assert(item0 != weights->encodedRanges.end()); + auto &range0 = item0->second; + int doubleBufferOffset = GetDoubleBufferOffset(weights, range0.index); + Address address = weights->address + range0.weightOffset + doubleBufferOffset; + int length = RoundAway(range0.weightBytes, 16); + CheckAddressRange(weights->memArea.memory, address, length); + Emit(isa::npu_set_weight_base_t(address)); + Emit(isa::npu_set_weight_length_t(length)); + memoryAccesses.emplace_back(AccessDirection::Read, weights->memArea, address, address + length); + auto item1 = weights->encodedRanges.find(WeightKey(1, depth)); + if ( item1 != weights->encodedRanges.end() ) + { + auto &range1 = item1->second; + Address address1 = weights->address + RoundAway(range0.TotalBytes(), 16) + range1.weightOffset + doubleBufferOffset; + int length1 = RoundAway(range1.weightBytes, 16); + CheckAddressRange(weights->memArea.memory, address1, length1); + Emit(isa::npu_set_weight1_base_t(address1)); + Emit(isa::npu_set_weight1_length_t(length1)); + memoryAccesses.emplace_back(AccessDirection::Read, weights->memArea, address1, address1 + length1); + } + else if ( _arch->_cores > 1 ) + { + Emit(isa::npu_set_weight1_length_t(0)); + } +} + +// Generates SCALE registers +void EthosU55RCSGenerator::GenerateScales(const HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto scales = stripe->operation->scales.get(); + if ( scales == nullptr ) + { + assert(!stripe->operation->weights); + return; + } + int depth = 
stripe->weightRangeDepth; + Emit(isa::npu_set_scale_region_t(ToRegion(scales->memArea))); + auto item0 = scales->encodedRanges.find(WeightKey(0, depth)); + assert(item0 != scales->encodedRanges.end()); + auto &range0 = item0->second; + int doubleBufferOffset = GetDoubleBufferOffset(scales, range0.index); + Address address = scales->address + doubleBufferOffset; + int length = RoundAway(range0.scaleBytes, 16); + CheckAddressRange(scales->memArea.memory, address, length); + Emit(isa::npu_set_scale_base_t(address)); + Emit(isa::npu_set_scale_length_t(length)); + memoryAccesses.emplace_back(AccessDirection::Read, scales->memArea, address, address + length); + auto item1 = scales->encodedRanges.find(WeightKey(1, depth)); + if ( item1 != scales->encodedRanges.end() ) + { + auto &range1 = item1->second; + Address address1 = address + RoundAway(range0.TotalBytes(), 16); + int length1 = RoundAway(range1.scaleBytes, 16); + CheckAddressRange(scales->memArea.memory, address1, length1); + Emit(isa::npu_set_scale1_base_t(address1)); + Emit(isa::npu_set_scale1_length_t(length1)); + memoryAccesses.emplace_back(AccessDirection::Read, scales->memArea, address1, address1 + length1); + } + else if ( _arch->_cores > 1 ) + { + Emit(isa::npu_set_scale1_length_t(0)); + } +} + +// Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers +void EthosU55RCSGenerator::GenerateBlockConfig(const EthosU55OpConfig *config) +{ + Emit(isa::npu_set_ofm_blk_height_m1_t(config->OfmBlock().Height() - 1)); + Emit(isa::npu_set_ofm_blk_width_m1_t(config->OfmBlock().Width() - 1)); + Emit(isa::npu_set_ofm_blk_depth_m1_t(config->OfmBlock().Depth() - 1)); +} + +// Generates IB_END/IB_START/AB_START/ACC_FORMAT registers +void EthosU55RCSGenerator::GenerateShramRegisters(const EthosU55OpConfig *config, bool hasIfm2) +{ + auto &layout = config->_layout; + Emit(isa::npu_set_ifm_ib_end_t(layout.ibEnd)); + Emit(isa::npu_set_ab_start_t(layout.abStart)); + if ( hasIfm2 ) + { + 
Emit(isa::npu_set_ifm2_ib_start_t(layout.ibStart2)); + } + // ACC_FORMAT register + auto accType = config->_accumulatorType; + acc_format format; + if ( accType == EthosU55SHRamElements::SHRAM_Acc16 ) + { + format = acc_format::F16; + } + else if ( accType == EthosU55SHRamElements::SHRAM_Acc32 ) + { + format = acc_format::I32; + } + else + { + assert(accType == EthosU55SHRamElements::SHRAM_Acc40); + format = acc_format::I40; + } + Emit(isa::npu_set_acc_format_t(format)); +} + +// Calculates and generates KERNEL_WAIT or DMA_WAIT register +void EthosU55RCSGenerator::GenerateWaits(bool isKernelWait, const MemoryAccesses &memoryAccesses, int maxWaits, + std::deque &outstandingAccesses, std::deque &accessesToUpdate) +{ + int waits = CalcCommandWaits(memoryAccesses, outstandingAccesses); + if ( waits >= 0 ) + { + if ( isKernelWait ) + { + Emit(isa::npu_op_kernel_wait_t(waits)); + } + else + { + Emit(isa::npu_op_dma_wait_t(waits)); + } + } + accessesToUpdate.push_back(memoryAccesses); + if ( int(accessesToUpdate.size()) > maxWaits ) + { + accessesToUpdate.pop_front(); + } +} + +// Inserts DMA commands for copying LUTs from constant memory +// to LUT memory +std::vector> +EthosU55RCSGenerator::InsertLUTDMACommands(std::vector> &cmds) +{ + std::vector> result; + int lutSlotSize = _arch->_shram.lutSlotSize; + int slots = (_arch->_shram.bankSizeBytes * _arch->_shram.lutBanks) / lutSlotSize; + std::vector lutSlots(slots); + int timestamp = 0; + result.reserve(cmds.size()); + for ( auto &hlc : cmds ) + { + ++timestamp; + if ( hlc->IsStripe() ) + { + auto stripe = static_cast(hlc.get()); + auto op = stripe->operation; + auto config = static_cast(op->config); + if ( !op->subOps.empty() && op->subOps[0].type == OpType::LUT ) + { + const auto &srcTens = op->subOps[0].parameters.lut; + assert(config->_layout.lutStart > 0); + assert(srcTens.sizeBytes % lutSlotSize == 0); + bool alreadyInLutMem; + int sizeInSlots = srcTens.sizeBytes / lutSlotSize; + int slot = 
AllocateLutSlot(lutSlots, op.get(), sizeInSlots, timestamp, alreadyInLutMem); + _stripeToLutSlot[stripe] = slot; + + if ( !alreadyInLutMem ) + { + auto dma = std::make_unique(); + dma->srcMemArea = srcTens.memArea; + dma->srcAddress = srcTens.address; + dma->length = srcTens.sizeBytes; + dma->destMemArea = _arch->LUTMemory(); + dma->destAddress = _arch->_shram.bankSizeBytes * config->_layout.lutStart + slot * lutSlotSize; + result.push_back(std::move(dma)); + } + } + else if ( _arch->_shram.reservedEndBanks == 0 ) + { + // LUT is overwritten by SHRAM accumulator buffers; clear slots + for ( auto &slot : lutSlots ) + { + slot.hlcOp = nullptr; + slot.lastUsed = 0; + } + } + } + result.push_back(std::move(hlc)); + } + return result; +} + +//---------------------------------------------------------------------- +// Operations +//---------------------------------------------------------------------- + +// Generates NPU_OP_* command +void EthosU55RCSGenerator::GenerateOperationCode(OpType opType) +{ + if ( IsPooling(opType) ) + { + pooling_mode mode; + if ( opType == OpType::AvgPool || opType == OpType::ResizeBilinear ) + { + mode = pooling_mode::AVERAGE; + } + else if ( opType == OpType::MaxPool ) + { + mode = pooling_mode::MAX; + } + else + { + assert(opType == OpType::ReduceSum); + mode = pooling_mode::REDUCE_SUM; + } + Emit(isa::npu_op_pool_t(mode)); + } + else if ( IsDepthwise(opType) ) + { + Emit(isa::npu_op_depthwise_t()); + } + else if ( IsConvolution(opType) || IsVectorProduct(opType) ) + { + Emit(isa::npu_op_conv_t()); + } + else if ( IsElementwise(opType) ) + { + const auto &item = s_ElementwiseMap.find(opType); + if ( item == s_ElementwiseMap.end() ) + { + assert(false && "Unsupported elementwise operator"); + } + else + { + Emit(isa::npu_op_elementwise_t(item->second)); + } + } + else if ( _arch->UseAvgPoolNop(opType) ) + { + // Implemented using AvgPool + Emit(isa::npu_op_pool_t(pooling_mode::AVERAGE)); + } + else + { + assert(false && "Unsupported 
operator"); + } +} + +void EthosU55RCSGenerator::GenerateCommon(const HLCStripe *stripe, bool useGlobalScale, RCSIfmScaleMode opToScale, + MemoryAccesses &memoryAccesses, int ifm0Index) +{ + auto op = stripe->operation.get(); + GenerateIFM(op->type, op->ifm[ifm0Index], stripe->ifmAreas[ifm0Index]); + memoryAccesses.push_back(ToMemoryAccess(op->ifm[ifm0Index], stripe->ifmAreas[ifm0Index], AccessDirection::Read)); + GenerateIFMPrecision(op->ifm[ifm0Index], opToScale); + ifm_upscale_mode upscaleMode = ToIfmUpscaleMode(op->ifm[0].resamplingMode); + Emit(isa::npu_set_ifm_upscale_t(upscaleMode)); + if ( !IsElementwise(op->type) ) + { + GeneratePadding(stripe->padding); + } + GenerateOFM(op->type, op->ofm, stripe->ofmArea); + memoryAccesses.push_back(ToMemoryAccess(op->ofm, stripe->ofmArea, AccessDirection::Write)); + RCSRoundMode roundMode = GetRoundingMode(op); + GenerateOFMPrecision(op->ofm, useGlobalScale, roundMode); + EthosU55OpConfig *config = static_cast(stripe->operation->config); + if ( !IsElementwise(op->type) ) + { + GenerateKernel(op->kernel, config->Traversal() == EthosUTraversal::PartKernel); + } + GenerateWeights(stripe, memoryAccesses); + GenerateScales(stripe, memoryAccesses); + GenerateActivation(stripe, memoryAccesses); + if ( _arch->_shram.reservedEndBanks == 0 ) + { + // SHRAM has no reserved LUT banks; LUT is overwritten by accumulator buffer + memoryAccesses.emplace_back( + AccessDirection::Write, _arch->LUTMemory(), 0, _arch->_shram.bankSizeBytes * _arch->_shram.totalBanks); + } +} + +// Conv2D/Depthwise operations +void EthosU55RCSGenerator::GenerateConvolutionOp(const HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + GenerateCommon(stripe, false, RCSIfmScaleMode::OPA_OPB_16, memoryAccesses); +} + +// MaxPool/AvgPool/ResizeBilinear or operations that are mapped to AvgPool +void EthosU55RCSGenerator::GeneratePoolingOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto op = stripe->operation.get(); + auto pad = stripe->padding; + 
auto padSum = pad.top + pad.left + pad.bottom + pad.right; + bool useGlobalScale = op->type != OpType::MaxPool && padSum == 0; + + if ( _arch->UseAvgPoolNop(op->type) ) + { + assert(op->kernel.Size() == Point2i(1, 1)); + assert(op->kernel.Stride() == Point2i(1, 1)); + assert(op->kernel.Dilation() == Point2i(1, 1)); + assert(op->kernel.DepthMultiplier() == 1); + assert(useGlobalScale); + } + GenerateCommon(stripe, useGlobalScale, RCSIfmScaleMode::OPA_OPB_16, memoryAccesses); + if ( useGlobalScale ) + { + GenerateOFMScalingForPooling(op); + } +} + +// Elementwise operations +void EthosU55RCSGenerator::GenerateElementwiseOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto op = stripe->operation.get(); + auto opType = op->type; + bool useGlobalScale = opType == OpType::Add || opType == OpType::Sub || opType == OpType::Mul || opType == OpType::LeakyRelu || opType == OpType::Abs; + if ( IsUnaryElementwise(opType) ) + { + assert(op->ifm.size() == 1); + auto opToScale = GenerateScalingForElementwise(op, 0); + GenerateCommon(stripe, useGlobalScale, opToScale, memoryAccesses); + } + else + { + // Binary operation: generate IFM2 registers + assert(op->ifm.size() == 2); + assert(stripe->ifmAreas.size() == 2); + int32_t scalarValue = 0; + auto ifmShape = stripe->ifmAreas[0].SizeShape(); + auto ifm2Shape = stripe->ifmAreas[1].SizeShape(); + bool reversedOperands = IsScalar(op->ifm[0], scalarValue) || (ifmShape != ifm2Shape && ifmShape.IsSubShapeOf(ifm2Shape)); + int ifmIndex = 0; + if ( reversedOperands ) + { + // If reversed, the scalar/broadcasted feature map has to be the ifm2 tensor, + // so switch ifm/ifm2 + ifmIndex = 1; + std::swap(ifmShape, ifm2Shape); + } + auto opToScale = GenerateScalingForElementwise(op, ifmIndex); + GenerateCommon(stripe, useGlobalScale, opToScale, memoryAccesses, ifmIndex); + int ifm2Index = 1 - ifmIndex; + bool isScalar = IsScalar(op->ifm[ifm2Index], scalarValue); + GenerateIFM2(opType, op->ifm[ifm2Index], stripe->ifmAreas[ifm2Index], 
isScalar, scalarValue); + if ( !isScalar ) + { + memoryAccesses.push_back(ToMemoryAccess(op->ifm[ifm2Index], stripe->ifmAreas[ifm2Index], AccessDirection::Read)); + } + GenerateIFM2Precision(op->ifm[ifm2Index]); + GenerateIFM2Broadcast(ifmShape, ifm2Shape, reversedOperands, isScalar); + } +} + +bool EthosU55RCSGenerator::GenerateStripe(HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto opType = stripe->operation->type; + EthosU55NpuOp npuOp = ArchEthosU55::GetHWOp(opType); + if ( npuOp == EthosU55NpuOp::Pooling || npuOp == EthosU55NpuOp::ReduceSum ) + { + GeneratePoolingOp(stripe, memoryAccesses); + } + else if ( npuOp == EthosU55NpuOp::Depthwise || npuOp == EthosU55NpuOp::Convolution || npuOp == EthosU55NpuOp::VectorProduct ) + { + GenerateConvolutionOp(stripe, memoryAccesses); + } + else if ( npuOp == EthosU55NpuOp::Elementwise ) + { + GenerateElementwiseOp(stripe, memoryAccesses); + } + else + { + LOG_ERROR("Register command stream generator: unsupported operator '{}'\n", OpTypeToString(opType)); + assert(false); + return false; + } + EthosU55OpConfig *config = static_cast(stripe->operation->config); + GenerateBlockConfig(config); + GenerateShramRegisters(config, stripe->operation->ifm.size() >= 2); + return true; +} + +// Generates register commands for DMA operations +void EthosU55RCSGenerator::GenerateDMA(const HLCDMA *dma, MemoryAccesses &memoryAccesses) +{ + auto srcRegionMode = dma_region_mode::EXTERNAL; + auto destRegionMode = dma_region_mode::EXTERNAL; + if ( dma->destMemArea == _arch->LUTMemory() ) + { + destRegionMode = dma_region_mode::INTERNAL; + } + auto strideMode = dma_stride_mode::D1; + CheckAddressRange(dma->srcMemArea.memory, dma->srcAddress, dma->length); + CheckAddressRange(dma->destMemArea.memory, dma->destAddress, dma->length); + Emit(isa::npu_set_dma0_src_region_t(ToRegion(dma->srcMemArea), srcRegionMode, strideMode)); + Emit(isa::npu_set_dma0_src_t(dma->srcAddress)); + 
Emit(isa::npu_set_dma0_dst_region_t(ToRegion(dma->destMemArea), destRegionMode, strideMode)); + Emit(isa::npu_set_dma0_dst_t(dma->destAddress)); + Emit(isa::npu_set_dma0_len_t(dma->length)); + memoryAccesses.emplace_back(AccessDirection::Read, dma->srcMemArea, dma->srcAddress, dma->srcAddress + dma->length); + memoryAccesses.emplace_back(AccessDirection::Write, dma->destMemArea, dma->destAddress, dma->destAddress + dma->length); +} + +std::vector EthosU55RCSGenerator::GenerateCommandStream(std::vector> &highLevelCommandStream, + std::vector> *cmdRanges, bool verbose) +{ + _emit.Clear(); + _stripeToLutSlot.clear(); + GenerateInitialRegisterSetup(); + auto cmds = InsertLUTDMACommands(highLevelCommandStream); + std::deque outstandingDmaAccesses; + std::deque outstandingNpuAccesses; + int maxOutstandingDMAOps = _arch->MaxOutstandingDMAOps(); + int maxOutstandingKernelOps = _arch->MaxOutstandingKernelOps(); + HLCStripe *prevOp = nullptr; + std::vector> debugInfo; + for ( auto &hlc : cmds ) + { + MemoryAccesses memoryAccesses; + int emitStart = _emit.Position(); + if ( hlc->IsStripe() ) + { + auto stripe = static_cast(hlc.get()); + if ( verbose ) + { + debugInfo.emplace_back(emitStart, stripe->operation->ToString()); + } + if ( !GenerateStripe(stripe, memoryAccesses) ) + { + return std::vector(); + } + // BLOCKDEP register + int blockdep = CalcBlockDep(prevOp, stripe); + Emit(isa::npu_set_blockdep_t(blockdep)); + GenerateWaits(false, memoryAccesses, maxOutstandingKernelOps, outstandingDmaAccesses, outstandingNpuAccesses); + GenerateOperationCode(stripe->operation->type); + prevOp = stripe; + // Return command mapping information to the caller + int emitEnd = _emit.Position(); + if ( cmdRanges ) + { + cmdRanges->emplace_back(stripe->operation->_srcKey, emitStart, emitEnd); + } + } + else + { + auto dma = static_cast(hlc.get()); + if ( verbose ) + { + debugInfo.emplace_back(emitStart, dma->ToString()); + } + GenerateDMA(static_cast(hlc.get()), memoryAccesses); + 
GenerateWaits(true, memoryAccesses, maxOutstandingDMAOps, outstandingNpuAccesses, outstandingDmaAccesses); + Emit(isa::npu_op_dma_start_t()); + } + } + Emit(isa::npu_op_stop_t(0xFFFF)); + if ( verbose ) + { + PrintCommandStream(_emit.CommandStream(), debugInfo); + } + return _emit.CommandStream(); +} + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp new file mode 100644 index 00000000..16abcf27 --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_register_cs_generator.hpp @@ -0,0 +1,269 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "architecture/ethos_u_register_cs_generator.hpp" +#include "architecture/ethos_u_scaling.hpp" +#include "common/data_type.hpp" +#include "compiler/high_level_command_stream.hpp" +#include "compiler/op_type.hpp" +#include "ethos_u55.hpp" + +#include +#include +#include +#include + +namespace regor +{ +class Emitter +{ +public: + Emitter() = default; + void Emit(uint32_t instr); + void Emit(uint64_t instr); + void Clear(); + int Position() const { return int(_stream.size()); } + const std::vector &CommandStream() const { return _stream; } + +private: + bool SetRegister(uint16_t reg, uint64_t value); + static bool IsCmd0(uint16_t key); + static bool IsCmd1(uint16_t key); + static bool IsOp(uint16_t key); + std::vector _stream; + std::unordered_map _registers; +}; + + +// Specifies the addresses and dimensions of the tiles of a feature map. +// A feature map can use 1 to 4 tiles +struct TileBox +{ + int height0; // The height of tile 0 + int height1; // The height of tile 1 + int width0; // The width of tile 0, and tile 2 (if used) + Address address[4]; // Tile addresses +}; + +enum BasePointerIndex +{ + WeightTensor = 0, // base address index for the Weight tensor + ScratchTensor = 1, // base address index for the Scratch_tensor in the TensorArena + ScratchFastTensor = 2, // base address for the Scratch_fast_tensor + Mem2Mem = 3, // base address slot for memory to memory transfer +}; + +enum class AccessDirection +{ + Read = 0, + Write = 1, +}; + +enum class RCSIfmScaleMode : uint8_t +{ + OPA_OPB_16 = 0, + OPA_32 = 1, + OPB_32 = 2, +}; + +enum class RCSRoundMode : uint8_t +{ + DBL = 0, + TRUNCATE = 1, + NATURAL = 2, +}; + +struct MemoryAccess +{ + AccessDirection direction; + MemArea memArea; + Address start; + Address end; + + MemoryAccess(AccessDirection direction_, MemArea area_, Address start_, Address end_) : + direction(direction_), memArea(area_), start(start_), 
end(end_) + { + } + + bool Conflicts(const MemoryAccess &other) const + { + bool overlaps = Overlaps(start, end, other.start, other.end) && memArea == other.memArea; + return overlaps && (direction != AccessDirection::Read || other.direction != AccessDirection::Read); + } +}; + +using MemoryAccesses = std::vector; + +struct LutSlot +{ + const HLCOperation *hlcOp = nullptr; + int lastUsed = 0; +}; + +/// +/// Generates register command streams for Ethos U55 and Ethos U65. +/// +class EthosU55RCSGenerator : public EthosURegisterCSGenerator +{ +public: + EthosU55RCSGenerator(ArchEthosU55 *arch); + + //---------------------------------------------------------------------- + // Print + //---------------------------------------------------------------------- + + int Disassemble(const uint32_t *in, std::string &op, std::vector> &fields); + +protected: + //---------------------------------------------------------------------- + // Helper functions + //---------------------------------------------------------------------- + void Emit(uint32_t instr); + void Emit(uint64_t instr); + + static int GetDoubleBufferOffset(HLCWeights *weights, int rangeIndex); + static void CheckAddressRange(ArchitectureMemory *memory, Address address, int size); + static void CheckAddresses(const HLCFeatureMap &fm); + // Calculates the rolling buffer address of the given coordinate. 
+ static Address AddressForCoordinate(const HLCFeatureMap &fm, const Shape &strides, const Shape &coord); + // Calculates tile sizes/addresses of a feature map + static TileBox GetTiles(const HLCFeatureMap &fm, const Shape &strides, const Box &area); + MemoryAccess ToMemoryAccess(const HLCFeatureMap &fm, const Box &area, AccessDirection direction); + // Returns region number used in NPU_SET_..._REGION + uint32_t ToRegion(const MemArea &memArea); + static bool UseZeroPoint0(OpType opType, const HLCFeatureMap &fm, bool isOFM); + // Checks if the feature map is a scalar, and if so, returns the + // quantized value in scalarValue. + static bool IsScalar(const HLCFeatureMap &fm, int32_t &scalarValue); + // Calculates waits for KERNEL_WAIT/DMA_WAIT, returns -1 if no wait is needed + // - opAccesses contains the memory accesses for the current operation + // - outstanding contains the memory accesses for ongoing "other" operations + // (DMA operations if the current op is an NPU operation, NPU operations if the current op is a DMA operation) + // Note: NPU - NPU dependency is handled via blockdep + static int CalcCommandWaits(const MemoryAccesses &opAccesses, std::deque &outstanding); + // Returns LUT slot to be used for the given LUT operation. + // Sets alreadyInLutMem to true if the LUT is already in SHRAM. + int AllocateLutSlot(std::vector &lutSlots, const HLCOperation *op, int sizeInSlots, int timestamp, bool &alreadyInLutMem); + //---------------------------------------------------------------------- + // Scaling (OFM/OPA/OPB_SCALE) + //---------------------------------------------------------------------- + + // Generates OFM_SCALE register for pooling operations + void GenerateOFMScalingForPooling(HLCOperation *poolOp); + // Generates OFM/OPA/OPB_SCALE registers for elementwise operators. 
+ // Returns the operator to scale + RCSIfmScaleMode GenerateScalingForElementwise(HLCOperation *op, int ifm0Index); + + + + //---------------------------------------------------------------------- + // BLOCKDEP calculation + //---------------------------------------------------------------------- + + // Given the area and block size, adds the first/last jobs (depending on fromStart) to jobs. + // - area: total amount of work to perform + // - block: size of each job + // - fromStart: if true, the first jobs are added, if false, the last jobs are added + // (in that case, the very last job is added last) + void GetJobs(const Box &area, const Shape &block, int nrJobsToGet, bool fromStart, std::vector &jobs); + // Calculates the value for the BLOCKDEP register + int CalcBlockDep(HLCStripe *prevStripe, HLCStripe *stripe); + + + + //---------------------------------------------------------------------- + // Register generation + //---------------------------------------------------------------------- + + void GeneratePadding(const HLCPadding &padding); + // Generates ACTIVATION registers + void GenerateActivation(const HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Generates KERNEL related registers + void GenerateKernel(const Kernel &kernel, bool partKernel); + // Generates IFM2_BROADCAST register for binary elementwise operations + void GenerateIFM2Broadcast(const Shape &ifmShape, const Shape &ifm2Shape, bool reversedOperands, bool isScalar); + // Generates IFM_PRECISION register + void GenerateIFMPrecision(const HLCFeatureMap &fm, RCSIfmScaleMode scaleMode); + // Generates IFM2_PRECISION register + void GenerateIFM2Precision(const HLCFeatureMap &fm); + // Generates OFM_PRECISION register + void GenerateOFMPrecision(const HLCFeatureMap &fm, bool useGlobalScale, RCSRoundMode roundMode); + // Generates common IFM registers + void GenerateIFM(OpType opType, const HLCFeatureMap &fm, const Box &inputArea); + // Generates common IFM2 registers + void 
GenerateIFM2(OpType opType, const HLCFeatureMap &fm, const Box &inputArea, bool isScalar, int32_t scalarValue); + // Generates OFM registers + void GenerateOFM(OpType opType, const HLCFeatureMap &fm, const Box &outputArea); + // Generates WEIGHT registers + void GenerateWeights(const HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Generates SCALE registers + void GenerateScales(const HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers + void GenerateBlockConfig(const EthosU55OpConfig *config); + // Generates IB_END/IB_START/AB_START/ACC_FORMAT registers + void GenerateShramRegisters(const EthosU55OpConfig *config, bool hasIfm2); + // Calculates and generates KERNEL_WAIT or DMA_WAIT register + void GenerateWaits(bool isKernelWait, const MemoryAccesses &memoryAccesses, int maxWaits, + std::deque &outstandingAccesses, std::deque &accessesToUpdate); + // Inserts DMA commands for copying LUTs from constant memory + // to LUT memory + std::vector> InsertLUTDMACommands(std::vector> &cmds); + + //---------------------------------------------------------------------- + // Operations + //---------------------------------------------------------------------- + + // Generates NPU_OP_* command + void GenerateOperationCode(OpType opType); + void GenerateCommon(const HLCStripe *stripe, bool useGlobalScale, RCSIfmScaleMode opToScale, + MemoryAccesses &memoryAccesses, int ifm0Index = 0); + // Conv2D/Depthwise operations + void GenerateConvolutionOp(const HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // MaxPool/AvgPool/ResizeBilinear or operations that are mapped to AvgPool + void GeneratePoolingOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Elementwise operations + void GenerateElementwiseOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses); + bool GenerateStripe(HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Generates register commands for DMA operations + void GenerateDMA(const HLCDMA 
*dma, MemoryAccesses &memoryAccesses); + + virtual void GenerateInitialRegisterSetup() + { + // No special initial setup for Ethos U55 + } + +public: + std::vector GenerateCommandStream(std::vector> &highLevelCommandStream, + std::vector> *cmdRanges, bool verbose) override; + + static uint32_t IdRegister(); + static bool IsSupportedElementwise(const OpType opType); + +private: + ArchEthosU55 *_arch; + // For stripes that use LUT: the LUT slot to be used + std::unordered_map _stripeToLutSlot; + Emitter _emit; +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.cpp new file mode 100644 index 00000000..6d40d4bb --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.cpp @@ -0,0 +1,325 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "ethos_u55_scaling.hpp" + +#include "architecture/ethos_u_scaling.hpp" +#include "compiler/high_level_command_stream.hpp" +#include "compiler/op_type.hpp" +#include "compiler/quantization.hpp" + +namespace regor::ethosU55Scaling +{ +namespace +{ +void AdvancedElementwiseAddSubScale(double input1Scale, double input2Scale, double outputScale, int bitDepth, + QuantizedScale &input1Rescale, QuantizedScale &outScale) +{ + auto maxInputScale = std::max(input1Scale, input2Scale); + auto minInputScale = std::min(input1Scale, input2Scale); + int inputShift = bitDepth == 8 ? 20 : 15; + double ifm1Rescale; + double ifm2Rescale; + SimplifiedElementwiseAddSubScale(minInputScale, maxInputScale, outputScale, inputShift, ifm1Rescale, ifm2Rescale, outScale); + input1Rescale = QuantizedScale(ifm1Rescale); +} + +float GetScale(const Quantization *quant) +{ + if ( quant != nullptr && quant->scales.size() != 0 ) + { + return float(quant->scales[0].Dequantize()); + } + else + { + return 1.0f; + } +} + +} // namespace + +void RescaleElementwise(HLCOperation *op) +{ + int ifmCnt = int(op->ifm.size()); + Quantization *ifm1Quant = &op->ifm[0].quantization; + Quantization *ifm2Quant = ifmCnt == 2 ? 
&op->ifm[1].quantization : nullptr; + Quantization *ofmQuant = &op->ofm.quantization; + + if ( ifm1Quant->type == QuantizationType::EXPLICIT && ofmQuant->type == QuantizationType::EXPLICIT && + (ifm2Quant == nullptr || ifm2Quant->type == QuantizationType::EXPLICIT) ) + { + return; + } + + QuantizedScale outScale(1, 0); + + double ifm1Scale = GetScale(ifm1Quant); + double ifm2Scale = GetScale(ifm2Quant); + double ofmScale = GetScale(ofmQuant); + + DataType ifmDataType = op->ifm[0].dataType; + OpType opType = op->type; + + bool allHaveScale = + (!ifm1Quant->scales.empty() && !ofmQuant->scales.empty() && ifm2Quant != nullptr && !ifm2Quant->scales.empty()); + + if ( opType == OpType::Mul ) + { + if ( allHaveScale ) + { + outScale = ElementwiseMulScale(ifm1Scale, ifm2Scale, ofmScale); + } + } + else if ( opType == OpType::Abs || opType == OpType::LeakyRelu ) + { + outScale = QuantizedScale(ofmScale); + } + else if ( opType == OpType::Add || opType == OpType::Sub ) + { + int bitDepth = DataTypeSizeBits(ifmDataType); + bool useAdvancedScaling = false; + uint32_t opaScale = 1; + uint32_t opbScale = 1; + int opaShift = 0; + int opbShift = 0; + double ifm1Rescale; + double ifm2Rescale; + if ( allHaveScale ) + { + if ( ifm1Scale == ifm2Scale ) + { + SimplifiedElementwiseAddSubScale(ifm1Scale, ifm2Scale, ofmScale, 16, ifm1Rescale, ifm2Rescale, outScale); + opaScale = uint32_t(round(ifm1Rescale)); + opbScale = uint32_t(round(ifm2Rescale)); + if ( bitDepth == 16 ) + { + // Align the double rounding with that of advanced scaling + opaScale /= 2; + opbScale /= 2; + --outScale.shift; + } + else + { + // For 8 bit we can't guarantee double rounding with simplified scaling will always be + // the same as with advanced scaling due to different shifts. When the ofm scale fulfils + // the following we know that double rounding will have no effect for advanced scaling + // no matter the input, so we can safely use simplified scaling with double rounding disabled. 
+ useAdvancedScaling = (outScale.scale & 0xFFF) != 0; + } + } + else + { + useAdvancedScaling = true; + } + if ( useAdvancedScaling ) + { + // Use advanced implementation only when input/output scales differ, + // or when we can't guarantee the absence of rounding errors + QuantizedScale inScale(1, 0); + AdvancedElementwiseAddSubScale(ifm1Scale, ifm2Scale, ofmScale, bitDepth, inScale, outScale); + if ( ifm1Scale <= ifm2Scale ) + { + opaScale = inScale.scale; + opaShift = inScale.shift; + opbScale = 0; + opbShift = 0; + } + else + { + opaScale = 0; + opaShift = 0; + opbScale = inScale.scale; + opbShift = inScale.shift; + } + } + } + if ( ifm1Quant != nullptr && ifm1Quant->type == QuantizationType::TFLITE ) + { + ifm1Quant->scales.clear(); + ifm1Quant->scales.push_back({int32_t(opaScale), opaShift}); + ifm1Quant->type = QuantizationType::EXPLICIT; + } + if ( ifm2Quant != nullptr && ifm2Quant->type == QuantizationType::TFLITE ) + { + ifm2Quant->scales.clear(); + ifm2Quant->scales.push_back({int32_t(opbScale), opbShift}); + ifm2Quant->type = QuantizationType::EXPLICIT; + } + } + if ( ofmQuant != nullptr && ofmQuant->type == QuantizationType::TFLITE ) + { + ofmQuant->scales.clear(); + ofmQuant->scales.push_back(outScale); + ofmQuant->type = QuantizationType::EXPLICIT; + } +} + +void RescalePooling(HLCOperation *op, bool isNoOp) +{ + + Quantization *ifm1Quant = &op->ifm[0].quantization; + Quantization *ofmQuant = &op->ofm.quantization; + uint32_t scale = 1; + int shift = 0; + DataType ifmDataType = op->ifm[0].dataType; + OpType opType = op->type; + + if ( ofmQuant->type != QuantizationType::TFLITE ) + { + // Explicit scaling + return; + } + + if ( !ifm1Quant->scales.empty() && !ofmQuant->scales.empty() ) + { + double ifmScale = GetScale(ifm1Quant); + double ofmScale = GetScale(ofmQuant); + auto actType = op->subOps.empty() ? 
opType : op->subOps[0].type; + if ( actType == OpType::Sigmoid || actType == OpType::Tanh ) + { + double rescale = 0x3000 * ifmScale; + if ( ifmDataType == DataType::Int16 ) + { + // Calculate scale and shift for the output scale of 1/(3*4096) + double xLog2 = std::log2(ifmScale); + int roundedLog2 = int(std::round(xLog2)); + bool isPowerOf2 = std::abs(xLog2 - roundedLog2) < 0.001; + shift = roundedLog2 + 12; + if ( isPowerOf2 && ((actType == OpType::Tanh && (shift == 0 || shift == 1)) || (actType == OpType::Sigmoid && (shift == 0))) ) + { + // Special handling if input scale is 1/2048 or 1/4096 + scale = 3 << shift; + shift = 0; + } + else + { + shift = 0; + int maxRescale = 16384; + while ( rescale < maxRescale && shift <= 30 ) + { + shift++; + rescale *= 2; + } + scale = uint32_t(rescale); + } + } + else + { + QuantizePoolingScaleMaxPrecision(op->kernel.ElementsWH(), rescale, scale, shift, 32); + } + } + else if ( opType == OpType::MemoryCopy ) + { + double rescale = ifmScale / ofmScale; + // In case of concat or other memory operation, rescaling might be needed. 
+ // The scale is maximised, to get maximum precision + QuantizePoolingScaleMaxPrecision(op->kernel.ElementsWH(), rescale, scale, shift, 32); + } + else if ( opType == OpType::Quantize ) + { + // Quantize operations need double-precision scaling + QuantizedScale quantScale(ifmScale / ofmScale); + scale = uint32_t(quantScale.scale); + shift = quantScale.shift; + } + else if ( isNoOp ) + { + QuantizedScale quantScale(float(ifmScale) / float(ofmScale)); + scale = uint32_t(quantScale.scale); + shift = quantScale.shift; + } + else + { + // Normal pooling operation, without need for special scaling + double rescale = ifmScale / ofmScale; + QuantizePoolingScale(op->kernel.ElementsWH(), rescale, 0, scale, shift, 32); + } + } + ofmQuant->scales.clear(); + ofmQuant->scales.push_back({int32_t(scale), shift}); + ofmQuant->type = QuantizationType::EXPLICIT; +} + +Quantization RescalePerChannel(const Quantization &ifmQuant, const Quantization &weightQuant, + const Quantization &ofmQuant, const DataType scaleDataType, const DataType ifmDataType) +{ + if ( ofmQuant.type != QuantizationType::TFLITE ) + { + // Explicit quantized scale has already been set + return ofmQuant; + } + + Quantization quantResult; + quantResult.type = QuantizationType::EXPLICIT; + quantResult.zeroPoints = ofmQuant.zeroPoints; + quantResult.quantMin = ofmQuant.quantMin; + quantResult.quantMax = ofmQuant.quantMax; + quantResult.dimension = ofmQuant.dimension; + quantResult.forceZeroPoint = ofmQuant.forceZeroPoint; + + if ( !ifmQuant.scales.empty() && !ofmQuant.scales.empty() && !weightQuant.scales.empty() ) + { + DataType dataType = DataType::None; + bool reducedScale = false; + if ( scaleDataType == DataType::Int32 ) + { + switch ( ifmDataType ) + { + case DataType::Int8: + case DataType::UInt8: + case DataType::Int16: + dataType = ifmDataType; + break; + default: + break; + } + } + else if ( scaleDataType == DataType::Int64 && DataTypeSizeBits(ifmDataType) == 16 ) + { + dataType = DataType::Int16; + 
reducedScale = true; + } + + int modIfm = (ifmQuant.scales.size()) == 1 ? 0 : -1; + int modOfm = (ofmQuant.scales.size()) == 1 ? 0 : -1; + + quantResult.scales.reserve(weightQuant.scales.size()); + + for ( int i = 0; i < int(weightQuant.scales.size()); i++ ) + { + double v = 1.0; + float ifmScale = float(ifmQuant.scales[i & modIfm].Dequantize()); + float ofmScale = float(ofmQuant.scales[i & modOfm].Dequantize()); + float weightScale = float(weightQuant.scales[i].Dequantize()); + if ( dataType == DataType::UInt8 ) + { + v = double(ifmScale * weightScale) / double(ofmScale); + } + else if ( dataType == DataType::Int8 || dataType == DataType::Int16 ) + { + v = (double(ifmScale) * double(weightScale)) / double(ofmScale); + } + + quantResult.scales.emplace_back(v, reducedScale); + } + } + + return quantResult; +} + +} // namespace regor::ethosU55Scaling diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.hpp new file mode 100644 index 00000000..84ce0894 --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_scaling.hpp @@ -0,0 +1,39 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "common/data_type.hpp" + +namespace regor +{ + +class Quantization; +struct HLCOperation; + +namespace ethosU55Scaling +{ + + +void RescalePooling(HLCOperation *op, bool isNoOp); +void RescaleElementwise(HLCOperation *op); +Quantization RescalePerChannel(const Quantization &ifmQuant, const Quantization &weightQuant, + const Quantization &ofmQuant, const DataType scaleDataType, const DataType ifmDataType); + +} // namespace ethosU55Scaling +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_weight_encoder.cpp b/ethosu/regor/architecture/ethosu55/ethos_u55_weight_encoder.cpp new file mode 100644 index 00000000..8e1b18f0 --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_weight_encoder.cpp @@ -0,0 +1,487 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "ethos_u55_weight_encoder.hpp" + +#include "common/logging.hpp" + +#include "architecture/architecture.hpp" +#include "architecture/ethos_u_scaling.hpp" +#include "architecture/mlw_encode.hpp" +#include "common/buffer_view.hpp" +#include "common/shape.hpp" +#include "compiler/tensor_properties.hpp" +#include "ethos_u55.hpp" +#include "ethos_u55_scaling.hpp" + +#include + +namespace regor +{ + + +EthosU55WeightEncoder::EthosUEncodingConfig::EthosUEncodingConfig(int cores) : _cores(cores) +{ +} + +void EthosU55WeightEncoder::EthosUEncodingConfig::Rehash() +{ + _depthOffsetHash = 0;  // Recompute BEFORE _hash so the combined hash reflects the current depthOffsets + for ( int offset : this->depthOffsets ) + { + _depthOffsetHash = _depthOffsetHash * 31 ^ offset; + } + + _hash = SimpleHash32(ofmBlockDepth, traversal, _depthOffsetHash, ifmType, dilation, ohwiStrides); +} + +uint32_t EthosU55WeightEncoder::EthosUEncodingConfig::Hash() +{ + return _hash; +} + +bool EthosU55WeightEncoder::EthosUEncodingConfig::Equals(IWeightEncodingConfig *other) +{ + EthosUEncodingConfig *p = static_cast(other); + return std::tie(ofmBlockDepth, traversal, _depthOffsetHash, ifmType, dilation, ohwiStrides) == + std::tie(p->ofmBlockDepth, p->traversal, p->_depthOffsetHash, p->ifmType, p->dilation, p->ohwiStrides);  // compare other's strides, not our own +} + +const std::vector &EthosU55WeightEncoder::EthosUEncodingConfig::DepthOffsets() +{ + return this->depthOffsets; +} + +Flags EthosU55WeightEncoder::EthosUEncodingConfig::Format() +{ + return WeightFormat::Default; +} + + +std::unique_ptr EthosU55WeightEncoder::GetEncodingConfig(ArchitectureOpConfig *opCfg, + const WeightsRef &weights, const Kernel *kernel, DataType ifmType, const std::vector &depthOffsets, Flags) +{ + std::unique_ptr params = std::make_unique(_arch->_cores); + + EthosU55OpConfig *opConfig = static_cast(opCfg); + params->ofmBlockDepth = opConfig->OfmBlock().Depth(); + params->traversal = opConfig->Traversal(); + params->depthOffsets = depthOffsets; + params->ifmType = ifmType; + params->dilation = kernel->Dilation(); + 
assert(!weights.isScales); + Shape ohwiStrides = weights.view->StrideBytes() * 8 / DataTypeSizeBits(weights.type); + if ( weights.axisOrder == AxisOrder::IHWO ) + { + ohwiStrides = ohwiStrides.Extract(3, 1, 2, 0); + } + params->ohwiStrides = std::move(ohwiStrides); + params->ohwiStrides[0] = params->ohwiStrides[0] * _arch->_cores; + params->Rehash(); + + return params; +} + +int EthosU55WeightEncoder::StreamsRequired(IWeightEncodingConfig *, const Shape &weightShape, int &scaleStreamsRequired) +{ + scaleStreamsRequired = std::min(weightShape[0], _arch->_cores); + return scaleStreamsRequired; +} + +static int EncodeBias(int64_t bias, int32_t scale, int shift, uint8_t data[10]) +{ + assert(-(1LL << (40 - 1)) <= bias && bias < (1LL << (40 - 1))); // signed 40-bit range + assert(0 <= shift && shift < (1 << 6)); // unsigned 6-bit range + + data[0] = uint8_t((bias >> (0 * 8)) & 0xFF); + data[1] = uint8_t((bias >> (1 * 8)) & 0xFF); + data[2] = uint8_t((bias >> (2 * 8)) & 0xFF); + data[3] = uint8_t((bias >> (3 * 8)) & 0xFF); + data[4] = uint8_t((bias >> (4 * 8)) & 0xFF); + data[5] = uint8_t((scale >> (0 * 8)) & 0xFF); + data[6] = uint8_t((scale >> (1 * 8)) & 0xFF); + data[7] = uint8_t((scale >> (2 * 8)) & 0xFF); + data[8] = uint8_t((scale >> (3 * 8)) & 0xFF); + data[9] = uint8_t(shift & 0x3F); + return 10; +} + + +template +class EthosUWeightOrdering : public WeightSourceCommon +{ +protected: + // Transform + WeightTransformParam *_param; + WeightTransformFunc _transform; + // Loop Limits + int _ofmBlockDepth; + int _ifmBlockDepth; + short _ofmUBlockDepth; + short _ifmUBlockDepth; + short _decompX; + short _decompY; + short _subKernelRound; + // Saved state + int _ofmBlockZ = 0; + int _ifmBlockZ = 0; + int _subKernelX = 0; + int _subKernelY = 0; + int _ifmUBlockOuter = 0; + int _ifmUBlockInner = 0; + int _ofmUBlockZ = 0; + int _ifmUBlockZ = 0; + int _kernelElement = 0; + int _ofmUBlock = 0; + EthosUTraversal _traversal; + +public: + EthosUWeightOrdering(int cores, const 
Point2i &dilation, int ofmBlockDepth, int ifmBitDepth, int ofmUBlockDepth, + int ifmUBlockDepth, WeightTransformFunc func, WeightTransformParam *param, EthosUTraversal traversal) + { + _streams = cores; + _ofmBlockDepth = ofmBlockDepth; + _ifmBlockDepth = ((traversal == EthosUTraversal::PartKernel) || (ifmBitDepth == 16)) ? 16 : 32; + _ofmUBlockDepth = short(ofmUBlockDepth); + _ifmUBlockDepth = short(ifmUBlockDepth); + _decompX = short(8 / dilation.x); + _decompY = short(8 / dilation.y); + if ( traversal == EthosUTraversal::Depthwise ) + { + _subKernelRound = 4; + } + else if ( traversal == EthosUTraversal::PartKernel ) + { + _subKernelRound = (ifmBitDepth == 16) ? 2 : 4; + } + else + { + _subKernelRound = 1; + } + _transform = func; + _param = param; + _traversal = traversal; + } + + void SetSource(const void *buffer, int depthOffset, const Shape &ohwiShape, const Shape &ohwiStrides, int streamIndex) override + { + SetSourceCommon(buffer, depthOffset + streamIndex, ohwiShape, ohwiStrides, streamIndex, true); + } + +public: + int Get(int16_t *output, int count) override + { + if ( _traversal == EthosUTraversal::Depthwise ) return GetNext(output, count); + else if ( _traversal == EthosUTraversal::PartKernel ) return GetNext(output, count); + return GetNext(output, count); + } + + template + int GetNext(int16_t *output, int count) + { + if ( _ofmBlockZ >= _ofmDepth ) + { + return 0; + } + + int ofmBlockZ, ifmBlockZ; + int ifmUBlockOuter, ifmUBlockInner; + int ifmUBlockZ, ofmUBlockZ, ofmUBlock; + int subKernelX, subKernelY; + int kernelElement; + int16_t *write = output; + + const TYPE *buffer = reinterpret_cast(_source); + int streamBlockDepth = (_ofmBlockDepth + _streams - 1 - _streamIndex) / _streams; + + for ( ofmBlockZ = _ofmBlockZ; ofmBlockZ < _ofmDepth; ofmBlockZ += streamBlockDepth ) + { + int clippedOfmBlockDepth = std::min(streamBlockDepth, _ofmDepth - ofmBlockZ); + // IFM blocks required for the brick + for ( ifmBlockZ = _ifmBlockZ; ifmBlockZ < 
(IS_DEPTHWISE ? 1 : _ifmDepth); ifmBlockZ += _ifmBlockDepth ) + { + int clippedIfmBlockDepth; + if ( IS_DEPTHWISE ) + { + clippedIfmBlockDepth = _ifmUBlockDepth; + } + else + { + clippedIfmBlockDepth = IS_PARTKERNEL ? std::min(_ifmBlockDepth, _ifmDepth - ifmBlockZ) : _ifmBlockDepth; + } + + // Weight decomposition + // Subkernel Splitting (H) + for ( subKernelY = _subKernelY; subKernelY < _kernelH; subKernelY += _decompY ) + { + int subHeight = std::min(_kernelH - subKernelY, _decompY); + // Subkernel splitting (W) + for ( subKernelX = _subKernelX; subKernelX < _kernelW; subKernelX += _decompX ) + { + int subWidth = std::min(_kernelW - subKernelX, _decompX); + int subKernelElements = subWidth * subHeight; + + // Part-kernel first works across the kernel H/W and needs padding + subKernelElements = RoundAway(subKernelElements, _subKernelRound); + + int ifmBlockDepthOuter = IS_PARTKERNEL ? clippedIfmBlockDepth : 1; + int ifmBlockDepthInner = IS_PARTKERNEL ? 1 : clippedIfmBlockDepth; + + for ( ifmUBlockOuter = _ifmUBlockOuter; ifmUBlockOuter < ifmBlockDepthOuter; ifmUBlockOuter += _ifmUBlockDepth ) + { + // OFM uBlocks in OFM-block over depth + for ( ofmUBlock = _ofmUBlock; ofmUBlock < clippedOfmBlockDepth; ofmUBlock += _ofmUBlockDepth ) + { + // HW Kernel element traversal - cannot be a H/W loop due to element + // padding requirement on depthwise/part-kernel configurations + for ( kernelElement = _kernelElement; kernelElement < subKernelElements; kernelElement++ ) + { + int kx = kernelElement % subWidth; + int ky = kernelElement / subWidth; + // IFM uBlocks in IFM-block over depth (only 1 uBlock if depthwise) + // In case of part-kernel-first IFM uBlock traversal have already been handled + // and this loop is ignored. 
+ for ( ifmUBlockInner = _ifmUBlockInner; ifmUBlockInner < ifmBlockDepthInner; ifmUBlockInner += _ifmUBlockDepth ) + { + int ifmUBlock = ifmUBlockInner + ifmUBlockOuter; + // Feed OFM uBlock elements + for ( ofmUBlockZ = _ofmUBlockZ; ofmUBlockZ < _ofmUBlockDepth; ofmUBlockZ++ ) + { + // Source IFM uBlock elements (only 1 element deep if depthwise) + for ( ifmUBlockZ = _ifmUBlockZ; ifmUBlockZ < (IS_DEPTHWISE ? 1 : _ifmUBlockDepth); ifmUBlockZ++ ) + { + // Source position within the current subkernel + int wx = subKernelX + kx; + int wy = subKernelY + ky; + // Source IFM/OFM slices + int ifm_z = ifmBlockZ + ifmUBlock + ifmUBlockZ; + int ofm_z = ofmBlockZ + ofmUBlock + ofmUBlockZ; + if ( (ifm_z < _ifmDepth) && (ofm_z < _ofmDepth) && (ky < subHeight) ) + { + _param->o = ofm_z; + _param->h = wy; + _param->w = wx; + _param->i = ifm_z; + int weight = int(buffer[WeightIndex(ofm_z, wy, wx, ifm_z)]); + *write = int16_t(_transform(_param, weight)); + } + else + { + *write = 0; + } + write++; + if ( --count == 0 ) + { + // Save state + _ifmUBlockZ = ifmUBlockZ + 1; + _ofmUBlockZ = ofmUBlockZ; + _ifmUBlockInner = ifmUBlockInner; + _kernelElement = kernelElement; + _ofmUBlock = ofmUBlock; + _ifmUBlockOuter = ifmUBlockOuter; + _subKernelX = subKernelX; + _subKernelY = subKernelY; + _ifmBlockZ = ifmBlockZ; + _ofmBlockZ = ofmBlockZ; + // Return weights generated (less than requested count == EOS) + return int(intptr_t(write - output)); + } + } + _ifmUBlockZ = 0; + } + _ofmUBlockZ = 0; + } + _ifmUBlockInner = 0; + } + _kernelElement = 0; + } + _ofmUBlock = 0; + } + _ifmUBlockOuter = 0; + } + _subKernelX = 0; + } + _subKernelY = 0; + } + _ifmBlockZ = 0; + } + _ofmBlockZ = 0; + return int(intptr_t(write - output)); + } +}; + + +std::unique_ptr EthosU55WeightEncoder::GetWeightSource( + IWeightEncodingConfig *config, DataType weightType, WeightTransformFunc func, WeightTransformParam *param) +{ + int ofmUBlockDepth = _arch->_ofmUBlock.Depth(); + int ifmUBlockDepth = 
_arch->_ifmUBlock.Depth(); + + EthosUEncodingConfig *cfg = static_cast(config); + int ifmBitDepth = DataTypeSizeBits(cfg->ifmType); + + if ( weightType == DataType::UInt8 ) + { + return std::make_unique>(_arch->_cores, cfg->dilation, cfg->ofmBlockDepth, + ifmBitDepth, ofmUBlockDepth, ifmUBlockDepth, func, param, cfg->traversal); + } + else if ( weightType == DataType::Int8 ) + { + return std::make_unique>(_arch->_cores, cfg->dilation, cfg->ofmBlockDepth, + ifmBitDepth, ofmUBlockDepth, ifmUBlockDepth, func, param, cfg->traversal); + } + + assert(false && "No weight source for this datatype"); + return nullptr; +} + + +template +class EthosUScaleSource : public IVolumeScaleSource +{ +private: + const TYPE *_buffer = nullptr; + const QuantizedScale *_scales = nullptr; + int _biasIndex = 0; + int _biasCount = 0; + int _streamIndex = 0; + int _streams = 0; + Quantization _quantization; + +public: + EthosUScaleSource(int cores, Quantization quantization) : _streams(cores), _quantization(std::move(quantization)) + { + assert(!_quantization.scales.empty()); + // assert that no scale is out of range + auto invalidScale = std::find_if(std::begin(_quantization.scales), std::end(_quantization.scales), + [](const auto q) { return q.shift < 0 || q.shift >= 64; }); + assert(invalidScale == std::end(_quantization.scales)); + } + + int Elements() + { + assert(_biasCount >= 0); + return _biasCount; + } + + int Get(int64_t *biasBuffer, QuantizedScale *quantBuffer, int count) + { + count = std::min(count, _biasCount); + const size_t scaleSize = _quantization.scales.size(); + + for ( int i = 0; i < count; i++ ) + { + int index = _biasIndex + (i * _streams); + *biasBuffer++ = _buffer[index]; + *quantBuffer++ = _quantization.scales[index % scaleSize]; + _biasCount--; + } + + _biasIndex += (count * _streams); + return count; + } + + void SetSource(const void *buffer, int biasCount, int depthOffset, int depthLength, int streamIndex) + { + assert(streamIndex >= 0 && streamIndex < _streams); 
+ UNUSED(biasCount); + assert(depthOffset + depthLength <= biasCount); + assert(uintptr_t(buffer) % alignof(TYPE) == 0); + _buffer = reinterpret_cast(buffer); + _biasIndex = depthOffset + streamIndex; // Where to start in the buffer + _biasCount = (depthLength + _streams - 1 - streamIndex) / _streams; // How many biases to generate + } +}; + + +std::unique_ptr EthosU55WeightEncoder::GetScaleSource( + IWeightEncodingConfig *config, DataType scaleType, const Quantization &explicitQuant) +{ + EthosUEncodingConfig *cfg = static_cast(config); + assert(explicitQuant.type == QuantizationType::EXPLICIT); + + if ( scaleType == DataType::Int32 ) + { + return std::make_unique>(_arch->_cores, explicitQuant); + } + else if ( scaleType == DataType::Int64 && DataTypeSizeBits(cfg->ifmType) == 16 ) + { + return std::make_unique>(_arch->_cores, explicitQuant); + } + + return nullptr; +} + +Quantization EthosU55WeightEncoder::MakeExplicit(const Quantization &ifmQ, const Quantization &weightQ, + const Quantization &ofmQ, DataType scaleType, DataType ifmType) +{ + if ( scaleType == DataType::Int64 && DataTypeSizeBits(ifmType) == 16 ) scaleType = DataType::Int32; + + return ethosU55Scaling::RescalePerChannel(ifmQ, weightQ, ofmQ, scaleType, ifmType); +} + + +WeightsInfo EthosU55WeightEncoder::EncodeWeights( + IWeightEncodingConfig *config, IWeightSource *source, std::vector &result, bool measureOnly) +{ + [[maybe_unused]] EthosUEncodingConfig *cfg = static_cast(config); + assert(cfg->Format() == WeightFormat::Default); + unsigned flags = measureOnly ? 
MLW_ENCODE_NO_BITSTREAM : MLW_ENCODE_FLAG_NONE; + auto res = mle_encode_proxy(source, 128 * 1024, result, flags); + return {res.elements_read, res.bytes_written, res.zero_count}; +} + + +int EthosU55WeightEncoder::EncodeScales(IWeightEncodingConfig *config, IScaleSource *source, std::vector &result, bool measureOnly) +{ + UNUSED(config); + constexpr int BUFFER_SIZE = 8; + constexpr int SCALE_ELEMENT_SIZE = 10; + + if ( measureOnly ) + { + return source->Elements() * SCALE_ELEMENT_SIZE; // Must be accurate + } + + int64_t scaleBuffer[BUFFER_SIZE]; + QuantizedScale quantBuffer[BUFFER_SIZE]; + + int start = int(result.size()); + int write = start; + result.reserve(start + source->Elements() * SCALE_ELEMENT_SIZE); + while ( true ) + { + int count = source->Get(scaleBuffer, quantBuffer, BUFFER_SIZE); + result.resize(write + (count * SCALE_ELEMENT_SIZE)); + + for ( int i = 0; i < count; i++ ) + { + write += EncodeBias(scaleBuffer[i], quantBuffer[i].scale, quantBuffer[i].shift, &result[write]); + } + + if ( count < BUFFER_SIZE ) + { + break; + } + } + + return write - start; +} + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu55/ethos_u55_weight_encoder.hpp b/ethosu/regor/architecture/ethosu55/ethos_u55_weight_encoder.hpp new file mode 100644 index 00000000..24fcffa7 --- /dev/null +++ b/ethosu/regor/architecture/ethosu55/ethos_u55_weight_encoder.hpp @@ -0,0 +1,86 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "architecture/architecture.hpp" +#include "architecture/ethos_u_scaling.hpp" +#include "architecture/mlw_encode.hpp" +#include "architecture/weight_encoder.hpp" +#include "common/shape.hpp" +#include "ethos_u55.hpp" + +namespace regor +{ + +/// +/// Encodes weights and biases. Common implementation for Ethos U55 and Ethos U65. +/// +class EthosU55WeightEncoder : public WeightEncoder +{ +private: + struct EthosUEncodingConfig : IWeightEncodingConfig + { + private: + uint32_t _hash = 0; + uint32_t _depthOffsetHash = 0; + int _cores = 0; + + public: + DataType ifmType = DataType::None; + int ofmBlockDepth = 0; + EthosUTraversal traversal = EthosUTraversal::DepthFirst; + std::vector depthOffsets; + Point2i dilation; + Shape ohwiStrides; + + public: + EthosUEncodingConfig(int cores); + void Rehash(); + uint32_t Hash() override; + bool Equals(IWeightEncodingConfig *other) override; + const std::vector &DepthOffsets() override; + Flags Format() override; + }; + +public: + EthosU55WeightEncoder(ArchEthosU55 *arch) : _arch(arch) {} + +public: + std::unique_ptr GetEncodingConfig(ArchitectureOpConfig *opCfg, const WeightsRef &weights, + const Kernel *kernel, DataType ifmType, const std::vector &depthOffsets, Flags format); + + int StreamsRequired(IWeightEncodingConfig *config, const Shape &weightShape, int &scaleStreamsRequired); + + std::unique_ptr GetWeightSource( + IWeightEncodingConfig *config, DataType weightType, WeightTransformFunc func, WeightTransformParam *param); + + std::unique_ptr GetScaleSource(IWeightEncodingConfig *config, DataType scaleType, const Quantization &explicitQuant); + + Quantization MakeExplicit(const Quantization &ifmQ, const Quantization &weightQ, const Quantization &ofmQ, + DataType scaleType, DataType ifmType); + + WeightsInfo EncodeWeights(IWeightEncodingConfig *config, IWeightSource *source, std::vector 
&result, bool measureOnly); + + int EncodeScales(IWeightEncodingConfig *config, IScaleSource *source, std::vector &result, bool measureOnly); + +private: + ArchEthosU55 *_arch; +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu65/ethos_u65.cpp b/ethosu/regor/architecture/ethosu65/ethos_u65.cpp new file mode 100644 index 00000000..5b7d71c3 --- /dev/null +++ b/ethosu/regor/architecture/ethosu65/ethos_u65.cpp @@ -0,0 +1,91 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "ethos_u65.hpp" + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "common/numeric_util.hpp" +#include "ethos_u65_register_cs_generator.hpp" + +#include +#include +#include + +namespace regor +{ + +static const EthosU55PerfInfo s_EthosU65PerfInfo[] = { + // Accelerator.Ethos_U65_256 + {{0.625, 1.125, 0.5, 0.375, 0.5, 0.75, 0.125, 0.25}, {1.0, 0.25, 0.0}}, + // Accelerator.Ethos_U65_512 + {{0.3125, 0.5625, 0.25, 0.1875, 0.25, 0.375, 0.0625, 0.125}, {0.5, 0.125, 0.0}}}; + +static const ArchEthosU55::AcceleratorConfig s_EthosU65Configs[] = { + // Accelerator.Ethos_U65_256 + {256, 1, Shape(2, 2, 8), Shape(2, 2, 8), 48, {8, 8, 8, 8, 16, 8, 16, 20}, 8, &s_EthosU65PerfInfo[0]}, + // Accelerator.Ethos_U65_512 + {256, 2, Shape(2, 2, 8), Shape(2, 2, 8), 48, {8, 8, 8, 8, 16, 8, 16, 20}, 8, &s_EthosU65PerfInfo[1]}, +}; + +ArchEthosU65::ArchEthosU65() +{ + _rcsGenerator = std::make_unique(this); +} + +bool ArchEthosU65::ParseConfig(IniReader *reader) +{ + // Parse architecture configuration + std::string key; + int macs = 0; + int cores = 0; + while ( reader->Begin(key) ) + { + if ( key == "macs" ) + { + macs = reader->Get(); + } + else if ( key == "cores" ) + { + cores = reader->Get(); + } + reader->End(); + } + + // Find the requested MAC configuration for this accelerator + auto cfg = std::find_if(s_EthosU65Configs, std::cend(s_EthosU65Configs), + [&](const AcceleratorConfig &config) { return config.macs == macs && config.cores == cores; }); + if ( cfg == std::cend(s_EthosU65Configs) ) + { + assert(macs == 256 && ((cores == 1) || (cores == 2))); + LOG_TRACE0("Unable to find U65 accelerator for macs={} cores={}", macs, cores); + return false; + } + + ApplyConfig(cfg); + + return true; +} + +std::vector ArchEthosU65::ConfigRegisters() +{ + return std::vector(1, ConfigRegister(1)); +} + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu65/ethos_u65.hpp b/ethosu/regor/architecture/ethosu65/ethos_u65.hpp new file mode 
100644 index 00000000..057f5d5b --- /dev/null +++ b/ethosu/regor/architecture/ethosu65/ethos_u65.hpp @@ -0,0 +1,45 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "architecture/ethosu55/ethos_u55.hpp" +#include "common/shape.hpp" + +#include + +namespace regor +{ + +/// +/// EthosU65 specialisation (based on U55) +/// +class ArchEthosU65 : public ArchEthosU55 +{ +public: + ArchEthosU65(); + + bool ParseConfig(IniReader *reader) override; + Address MaxAddress() override { return 1LL << 40; } + std::vector ConfigRegisters() override; + +private: + int MaxOutstandingDMAOps() override { return 2; } +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu65/ethos_u65_interface.hpp b/ethosu/regor/architecture/ethosu65/ethos_u65_interface.hpp new file mode 100644 index 00000000..968d4015 --- /dev/null +++ b/ethosu/regor/architecture/ethosu65/ethos_u65_interface.hpp @@ -0,0 +1,21699 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#ifdef __KERNEL__ +#include +#else +#include +#endif + +#if !defined(__cplusplus) || __cplusplus < 201402L +#define CONSTEXPR +#else +#define CONSTEXPR constexpr +#endif + +#ifndef __cplusplus +#define STRUCT struct +#else +#define STRUCT +#endif + +#if defined(__cplusplus) && defined(NPU_DISASSEMBLE) +#include +#include +#include +#endif + +#if defined(__cplusplus) && !defined(NPU_NAMESPACE) +#define NPU_NAMESPACE npu +#endif + +#ifdef __cplusplus +#include +#include +#include +#endif + +#ifdef __cplusplus +namespace NPU_NAMESPACE +{ +#endif +#define NNX_ARCH_VERSION_MAJOR 1 +#define NNX_ARCH_VERSION_MINOR 0 +#define NNX_ARCH_VERSION_PATCH 6 + + + + + +#define NPU_REG_ID 0x0000 +#define NPU_REG_STATUS 0x0004 +#define NPU_REG_CMD 0x0008 +#define NPU_REG_RESET 0x000C +#define NPU_REG_QBASE 0x0010 +#define NPU_REG_QBASE_HI 0x0014 +#define NPU_REG_QREAD 0x0018 +#define NPU_REG_QCONFIG 0x001C +#define NPU_REG_QSIZE 0x0020 +#define NPU_REG_PROT 0x0024 +#define NPU_REG_CONFIG 0x0028 +#define NPU_REG_LOCK 0x002C +#define NPU_REG_REGIONCFG 0x003C +#define NPU_REG_AXI_LIMIT0 0x0040 +#define NPU_REG_AXI_LIMIT1 0x0044 +#define NPU_REG_AXI_LIMIT2 0x0048 +#define NPU_REG_AXI_LIMIT3 0x004C +#define BASE_REGISTERS_SIZE 0x0080 + + + + +#define NPU_REG_BASEP_BASE 0x0080 +#define NPU_REG_BASEP_ARRLEN 0x0008 +#define BASE_POINTERS_REGISTERS_SIZE 0x0100 + + + + +#define NPU_REG_WD_STATUS 0x0100 +#define NPU_REG_MAC_STATUS 0x0104 +#define NPU_REG_AO_STATUS 0x0108 +#define NPU_REG_DMA_STATUS0 0x0110 +#define NPU_REG_DMA_STATUS1 0x0114 +#define 
NPU_REG_CLKFORCE 0x0140 +#define NPU_REG_DEBUG_ADDRESS 0x0144 +#define NPU_REG_DEBUG_MISC 0x0148 +#define NPU_REG_DEBUGCORE 0x014C +#define NPU_REG_DEBUG_BLOCK 0x0150 +#define DEBUG_REGISTERS_SIZE 0x0180 + + + + +#define NPU_REG_PMCR 0x0180 +#define NPU_REG_PMCNTENSET 0x0184 +#define NPU_REG_PMCNTENCLR 0x0188 +#define NPU_REG_PMOVSSET 0x018C +#define NPU_REG_PMOVSCLR 0x0190 +#define NPU_REG_PMINTSET 0x0194 +#define NPU_REG_PMINTCLR 0x0198 +#define NPU_REG_PMCCNTR 0x01A0 +#define NPU_REG_PMCCNTR_HI 0x01A4 +#define NPU_REG_PMCCNTR_CFG 0x01A8 +#define NPU_REG_PMCAXI_CHAN 0x01AC +#define PMU_REGISTERS_SIZE 0x0200 + + + + +#define NPU_REG_KERNEL_X 0x0200 +#define NPU_REG_KERNEL_Y 0x0204 +#define NPU_REG_KERNEL_W_M1 0x0208 +#define NPU_REG_KERNEL_H_M1 0x020C +#define NPU_REG_OFM_CBLK_WIDTH_M1 0x0210 +#define NPU_REG_OFM_CBLK_HEIGHT_M1 0x0214 +#define NPU_REG_OFM_CBLK_DEPTH_M1 0x0218 +#define NPU_REG_IFM_CBLK_DEPTH_M1 0x021C +#define NPU_REG_OFM_X 0x0220 +#define NPU_REG_OFM_Y 0x0224 +#define NPU_REG_OFM_Z 0x0228 +#define NPU_REG_IFM_Z 0x022C +#define NPU_REG_PAD_TOP 0x0230 +#define NPU_REG_PAD_LEFT 0x0234 +#define NPU_REG_IFM_CBLK_WIDTH 0x0238 +#define NPU_REG_IFM_CBLK_HEIGHT 0x023C +#define NPU_REG_DMA_IFM_SRC 0x0240 +#define NPU_REG_DMA_IFM_SRC_HI 0x0244 +#define NPU_REG_DMA_IFM_DST 0x0248 +#define NPU_REG_DMA_OFM_SRC 0x024C +#define NPU_REG_DMA_OFM_DST 0x0250 +#define NPU_REG_DMA_OFM_DST_HI 0x0254 +#define NPU_REG_DMA_WEIGHT_SRC 0x0258 +#define NPU_REG_DMA_WEIGHT_SRC_HI 0x025C +#define NPU_REG_DMA_CMD_SRC 0x0260 +#define NPU_REG_DMA_CMD_SRC_HI 0x0264 +#define NPU_REG_DMA_CMD_SIZE 0x0268 +#define NPU_REG_DMA_M2M_SRC 0x026C +#define NPU_REG_DMA_M2M_SRC_HI 0x0270 +#define NPU_REG_DMA_M2M_DST 0x0274 +#define NPU_REG_DMA_M2M_DST_HI 0x0278 +#define NPU_REG_CURRENT_QREAD 0x027C +#define NPU_REG_DMA_SCALE_SRC 0x0280 +#define NPU_REG_DMA_SCALE_SRC_HI 0x0284 +#define NPU_REG_CURRENT_BLOCK 0x02B4 +#define NPU_REG_CURRENT_OP 0x02B8 +#define NPU_REG_CURRENT_CMD 0x02BC +#define 
TSU_DEBUG_REGISTERS_SIZE 0x0300 + + + + +#define NPU_REG_PMEVCNTR_BASE 0x0300 +#define NPU_REG_PMEVCNTR_ARRLEN 0x0004 +#define NPU_REG_PMEVTYPER_BASE 0x0380 +#define NPU_REG_PMEVTYPER_ARRLEN 0x0004 +#define PMU_COUNTERS_REGISTERS_SIZE 0x0400 + + + + +#define NPU_REG_SHARED_BUFFER_BASE 0x0400 +#define NPU_REG_SHARED_BUFFER_ARRLEN 0x0100 +#define SHARED_BUFFER_REGISTERS_SIZE 0x0800 + + + + +#define NPU_REG_IFM_PAD_TOP 0x0800 +#define NPU_REG_IFM_PAD_LEFT 0x0804 +#define NPU_REG_IFM_PAD_RIGHT 0x0808 +#define NPU_REG_IFM_PAD_BOTTOM 0x080C +#define NPU_REG_IFM_DEPTH_M1 0x0810 +#define NPU_REG_IFM_PRECISION 0x0814 +#define NPU_REG_IFM_UPSCALE 0x081C +#define NPU_REG_IFM_ZERO_POINT 0x0824 +#define NPU_REG_IFM_WIDTH0_M1 0x0828 +#define NPU_REG_IFM_HEIGHT0_M1 0x082C +#define NPU_REG_IFM_HEIGHT1_M1 0x0830 +#define NPU_REG_IFM_IB_END 0x0834 +#define NPU_REG_IFM_REGION 0x083C +#define TSU_IFM_REGISTERS_SIZE 0x0840 + + + + +#define NPU_REG_OFM_WIDTH_M1 0x0844 +#define NPU_REG_OFM_HEIGHT_M1 0x0848 +#define NPU_REG_OFM_DEPTH_M1 0x084C +#define NPU_REG_OFM_PRECISION 0x0850 +#define NPU_REG_OFM_BLK_WIDTH_M1 0x0854 +#define NPU_REG_OFM_BLK_HEIGHT_M1 0x0858 +#define NPU_REG_OFM_BLK_DEPTH_M1 0x085C +#define NPU_REG_OFM_ZERO_POINT 0x0860 +#define NPU_REG_OFM_WIDTH0_M1 0x0868 +#define NPU_REG_OFM_HEIGHT0_M1 0x086C +#define NPU_REG_OFM_HEIGHT1_M1 0x0870 +#define NPU_REG_OFM_REGION 0x087C +#define TSU_OFM_REGISTERS_SIZE 0x0880 + + + + +#define NPU_REG_KERNEL_WIDTH_M1 0x0880 +#define NPU_REG_KERNEL_HEIGHT_M1 0x0884 +#define NPU_REG_KERNEL_STRIDE 0x0888 +#define NPU_REG_PARALLEL_MODE 0x088C +#define NPU_REG_ACC_FORMAT 0x0890 +#define NPU_REG_ACTIVATION 0x0894 +#define NPU_REG_ACTIVATION_MIN 0x0898 +#define NPU_REG_ACTIVATION_MAX 0x089C +#define NPU_REG_WEIGHT_REGION 0x08A0 +#define NPU_REG_SCALE_REGION 0x08A4 +#define NPU_REG_AB_START 0x08B4 +#define NPU_REG_BLOCKDEP 0x08BC +#define TSU_KERNEL_REGISTERS_SIZE 0x08C0 + + + + +#define NPU_REG_DMA0_SRC_REGION 0x08C0 +#define 
NPU_REG_DMA0_DST_REGION 0x08C4 +#define NPU_REG_DMA0_SIZE0 0x08C8 +#define NPU_REG_DMA0_SIZE1 0x08CC +#define TSU_DMA_REGISTERS_SIZE 0x0900 + + + + +#define NPU_REG_IFM2_BROADCAST 0x0900 +#define NPU_REG_IFM2_SCALAR 0x0904 +#define NPU_REG_IFM2_PRECISION 0x0914 +#define NPU_REG_IFM2_ZERO_POINT 0x0924 +#define NPU_REG_IFM2_WIDTH0_M1 0x0928 +#define NPU_REG_IFM2_HEIGHT0_M1 0x092C +#define NPU_REG_IFM2_HEIGHT1_M1 0x0930 +#define NPU_REG_IFM2_IB_START 0x0934 +#define NPU_REG_IFM2_REGION 0x093C +#define TSU_IFM2_REGISTERS_SIZE 0x0940 + + + + +#define NPU_REG_IFM_BASE0 0x0A00 +#define NPU_REG_IFM_BASE0_HI 0x0A04 +#define NPU_REG_IFM_BASE1 0x0A08 +#define NPU_REG_IFM_BASE1_HI 0x0A0C +#define NPU_REG_IFM_BASE2 0x0A10 +#define NPU_REG_IFM_BASE2_HI 0x0A14 +#define NPU_REG_IFM_BASE3 0x0A18 +#define NPU_REG_IFM_BASE3_HI 0x0A1C +#define NPU_REG_IFM_STRIDE_X 0x0A20 +#define NPU_REG_IFM_STRIDE_X_HI 0x0A24 +#define NPU_REG_IFM_STRIDE_Y 0x0A28 +#define NPU_REG_IFM_STRIDE_Y_HI 0x0A2C +#define NPU_REG_IFM_STRIDE_C 0x0A30 +#define NPU_REG_IFM_STRIDE_C_HI 0x0A34 +#define TSU_IFM_BASE_REGISTERS_SIZE 0x0A40 + + + + +#define NPU_REG_OFM_BASE0 0x0A40 +#define NPU_REG_OFM_BASE0_HI 0x0A44 +#define NPU_REG_OFM_BASE1 0x0A48 +#define NPU_REG_OFM_BASE1_HI 0x0A4C +#define NPU_REG_OFM_BASE2 0x0A50 +#define NPU_REG_OFM_BASE2_HI 0x0A54 +#define NPU_REG_OFM_BASE3 0x0A58 +#define NPU_REG_OFM_BASE3_HI 0x0A5C +#define NPU_REG_OFM_STRIDE_X 0x0A60 +#define NPU_REG_OFM_STRIDE_X_HI 0x0A64 +#define NPU_REG_OFM_STRIDE_Y 0x0A68 +#define NPU_REG_OFM_STRIDE_Y_HI 0x0A6C +#define NPU_REG_OFM_STRIDE_C 0x0A70 +#define NPU_REG_OFM_STRIDE_C_HI 0x0A74 +#define TSU_OFM_BASE_REGISTERS_SIZE 0x0A80 + + + + +#define NPU_REG_WEIGHT_BASE 0x0A80 +#define NPU_REG_WEIGHT_BASE_HI 0x0A84 +#define NPU_REG_WEIGHT_LENGTH 0x0A88 +#define NPU_REG_WEIGHT_LENGTH_HI 0x0A8C +#define NPU_REG_SCALE_BASE 0x0A90 +#define NPU_REG_SCALE_BASE_HI 0x0A94 +#define NPU_REG_SCALE_LENGTH 0x0A98 +#define NPU_REG_SCALE_LENGTH_HI 0x0A9C +#define 
NPU_REG_OFM_SCALE 0x0AA0 +#define NPU_REG_OFM_SCALE_HI 0x0AA4 +#define NPU_REG_OPA_SCALE 0x0AA8 +#define NPU_REG_OPA_SCALE_HI 0x0AAC +#define NPU_REG_OPB_SCALE 0x0AB0 +#define TSU_WS_BASE_REGISTERS_SIZE 0x0AC0 + + + + +#define NPU_REG_DMA0_SRC 0x0AC0 +#define NPU_REG_DMA0_SRC_HI 0x0AC4 +#define NPU_REG_DMA0_DST 0x0AC8 +#define NPU_REG_DMA0_DST_HI 0x0ACC +#define NPU_REG_DMA0_LEN 0x0AD0 +#define NPU_REG_DMA0_LEN_HI 0x0AD4 +#define NPU_REG_DMA0_SKIP0 0x0AD8 +#define NPU_REG_DMA0_SKIP0_HI 0x0ADC +#define NPU_REG_DMA0_SKIP1 0x0AE0 +#define NPU_REG_DMA0_SKIP1_HI 0x0AE4 +#define TSU_DMA_BASE_REGISTERS_SIZE 0x0B00 + + + + +#define NPU_REG_IFM2_BASE0 0x0B00 +#define NPU_REG_IFM2_BASE0_HI 0x0B04 +#define NPU_REG_IFM2_BASE1 0x0B08 +#define NPU_REG_IFM2_BASE1_HI 0x0B0C +#define NPU_REG_IFM2_BASE2 0x0B10 +#define NPU_REG_IFM2_BASE2_HI 0x0B14 +#define NPU_REG_IFM2_BASE3 0x0B18 +#define NPU_REG_IFM2_BASE3_HI 0x0B1C +#define NPU_REG_IFM2_STRIDE_X 0x0B20 +#define NPU_REG_IFM2_STRIDE_X_HI 0x0B24 +#define NPU_REG_IFM2_STRIDE_Y 0x0B28 +#define NPU_REG_IFM2_STRIDE_Y_HI 0x0B2C +#define NPU_REG_IFM2_STRIDE_C 0x0B30 +#define NPU_REG_IFM2_STRIDE_C_HI 0x0B34 +#define TSU_IFM2_BASE_REGISTERS_SIZE 0x0B40 + + + + +#define NPU_REG_WEIGHT1_BASE 0x0B40 +#define NPU_REG_WEIGHT1_BASE_HI 0x0B44 +#define NPU_REG_WEIGHT1_LENGTH 0x0B48 +#define NPU_REG_WEIGHT1_LENGTH_HI 0x0B4C +#define NPU_REG_SCALE1_BASE 0x0B50 +#define NPU_REG_SCALE1_BASE_HI 0x0B54 +#define NPU_REG_SCALE1_LENGTH 0x0B58 +#define NPU_REG_SCALE1_LENGTH_HI 0x0B5C +#define TSU_WS1_BASE_REGISTERS_SIZE 0x0B80 + + + + +#define TSU_USER_BASE_REGISTERS_SIZE 0x0BC0 + + + + +#define TSU_DMA_EBASE_REGISTERS_SIZE 0x0C00 + + + + +#define NPU_REG_REVISION 0x0FC0 +#define NPU_REG_PID4 0x0FD0 +#define NPU_REG_PID5 0x0FD4 +#define NPU_REG_PID6 0x0FD8 +#define NPU_REG_PID7 0x0FDC +#define NPU_REG_PID0 0x0FE0 +#define NPU_REG_PID1 0x0FE4 +#define NPU_REG_PID2 0x0FE8 +#define NPU_REG_PID3 0x0FEC +#define NPU_REG_CID0 0x0FF0 +#define NPU_REG_CID1 0x0FF4 
+#define NPU_REG_CID2 0x0FF8 +#define NPU_REG_CID3 0x0FFC +#define ID_REGISTERS_SIZE 0x1000 + +#ifdef __cplusplus + +enum class acc_format : uint8_t +{ + I32 = 0, + I40 = 1, + F16 = 2, +}; + +enum class activation_clip_range : uint8_t +{ + OFM_PRECISION = 0, + FORCE_UINT8 = 2, + FORCE_INT8 = 3, + FORCE_INT16 = 5, +}; + +enum class activation_format : uint8_t +{ + NHWC = 0, + NHCWB16 = 1, +}; + +enum class activation_function : uint8_t +{ + RELU = 0, + TANH = 3, + SIGMOID = 4, + TABLE_0 = 16, + TABLE_1 = 17, + TABLE_2 = 18, + TABLE_3 = 19, + TABLE_4 = 20, + TABLE_5 = 21, + TABLE_6 = 22, + TABLE_7 = 23, +}; + +enum class activation_precision : uint8_t +{ + B8 = 0, + B16 = 1, + B32 = 2, + B64 = 3, +}; + +enum class activation_type : uint8_t +{ + UNSIGNED = 0, + SIGNED = 1, +}; + +enum class axi_mem_encoding : uint8_t +{ + DEVICE_NON_BUFFERABLE = 0, + DEVICE_BUFFERABLE = 1, + NORMAL_NON_CACHEABLE_NON_BUFFERABLE = 2, + NORMAL_NON_CACHEABLE_BUFFERABLE = 3, + WRITE_THROUGH_NO_ALLOCATE = 4, + WRITE_THROUGH_READ_ALLOCATE = 5, + WRITE_THROUGH_WRITE_ALLOCATE = 6, + WRITE_THROUGH_READ_AND_WRITE_ALLOCATE = 7, + WRITE_BACK_NO_ALLOCATE = 8, + WRITE_BACK_READ_ALLOCATE = 9, + WRITE_BACK_WRITE_ALLOCATE = 10, + WRITE_BACK_READ_AND_WRITE_ALLOCATE = 11, +}; + +enum class broadcast_mode : uint8_t +{ + DISABLE = 0, + ENABLE = 1, +}; + +enum class cmd0_opcode : uint16_t +{ + NPU_OP_STOP = 0, + NPU_OP_IRQ = 1, + NPU_OP_CONV = 2, + NPU_OP_DEPTHWISE = 3, + NPU_OP_POOL = 5, + NPU_OP_ELEMENTWISE = 6, + NPU_OP_DMA_START = 16, + NPU_OP_DMA_WAIT = 17, + NPU_OP_KERNEL_WAIT = 18, + NPU_OP_PMU_MASK = 19, + NPU_SET_IFM_PAD_TOP = 256, + NPU_SET_IFM_PAD_LEFT = 257, + NPU_SET_IFM_PAD_RIGHT = 258, + NPU_SET_IFM_PAD_BOTTOM = 259, + NPU_SET_IFM_DEPTH_M1 = 260, + NPU_SET_IFM_PRECISION = 261, + NPU_SET_IFM_UPSCALE = 263, + NPU_SET_IFM_ZERO_POINT = 265, + NPU_SET_IFM_WIDTH0_M1 = 266, + NPU_SET_IFM_HEIGHT0_M1 = 267, + NPU_SET_IFM_HEIGHT1_M1 = 268, + NPU_SET_IFM_IB_END = 269, + NPU_SET_IFM_REGION = 271, + 
NPU_SET_OFM_WIDTH_M1 = 273, + NPU_SET_OFM_HEIGHT_M1 = 274, + NPU_SET_OFM_DEPTH_M1 = 275, + NPU_SET_OFM_PRECISION = 276, + NPU_SET_OFM_BLK_WIDTH_M1 = 277, + NPU_SET_OFM_BLK_HEIGHT_M1 = 278, + NPU_SET_OFM_BLK_DEPTH_M1 = 279, + NPU_SET_OFM_ZERO_POINT = 280, + NPU_SET_OFM_WIDTH0_M1 = 282, + NPU_SET_OFM_HEIGHT0_M1 = 283, + NPU_SET_OFM_HEIGHT1_M1 = 284, + NPU_SET_OFM_REGION = 287, + NPU_SET_KERNEL_WIDTH_M1 = 288, + NPU_SET_KERNEL_HEIGHT_M1 = 289, + NPU_SET_KERNEL_STRIDE = 290, + NPU_SET_PARALLEL_MODE = 291, + NPU_SET_ACC_FORMAT = 292, + NPU_SET_ACTIVATION = 293, + NPU_SET_ACTIVATION_MIN = 294, + NPU_SET_ACTIVATION_MAX = 295, + NPU_SET_WEIGHT_REGION = 296, + NPU_SET_SCALE_REGION = 297, + NPU_SET_AB_START = 301, + NPU_SET_BLOCKDEP = 303, + NPU_SET_DMA0_SRC_REGION = 304, + NPU_SET_DMA0_DST_REGION = 305, + NPU_SET_DMA0_SIZE0 = 306, + NPU_SET_DMA0_SIZE1 = 307, + NPU_SET_IFM2_BROADCAST = 384, + NPU_SET_IFM2_SCALAR = 385, + NPU_SET_IFM2_PRECISION = 389, + NPU_SET_IFM2_ZERO_POINT = 393, + NPU_SET_IFM2_WIDTH0_M1 = 394, + NPU_SET_IFM2_HEIGHT0_M1 = 395, + NPU_SET_IFM2_HEIGHT1_M1 = 396, + NPU_SET_IFM2_IB_START = 397, + NPU_SET_IFM2_REGION = 399, +}; + +enum class cmd1_opcode : uint16_t +{ + NPU_SET_IFM_BASE0 = 0, + NPU_SET_IFM_BASE1 = 1, + NPU_SET_IFM_BASE2 = 2, + NPU_SET_IFM_BASE3 = 3, + NPU_SET_IFM_STRIDE_X = 4, + NPU_SET_IFM_STRIDE_Y = 5, + NPU_SET_IFM_STRIDE_C = 6, + NPU_SET_OFM_BASE0 = 16, + NPU_SET_OFM_BASE1 = 17, + NPU_SET_OFM_BASE2 = 18, + NPU_SET_OFM_BASE3 = 19, + NPU_SET_OFM_STRIDE_X = 20, + NPU_SET_OFM_STRIDE_Y = 21, + NPU_SET_OFM_STRIDE_C = 22, + NPU_SET_WEIGHT_BASE = 32, + NPU_SET_WEIGHT_LENGTH = 33, + NPU_SET_SCALE_BASE = 34, + NPU_SET_SCALE_LENGTH = 35, + NPU_SET_OFM_SCALE = 36, + NPU_SET_OPA_SCALE = 37, + NPU_SET_OPB_SCALE = 38, + NPU_SET_DMA0_SRC = 48, + NPU_SET_DMA0_DST = 49, + NPU_SET_DMA0_LEN = 50, + NPU_SET_DMA0_SKIP0 = 51, + NPU_SET_DMA0_SKIP1 = 52, + NPU_SET_IFM2_BASE0 = 128, + NPU_SET_IFM2_BASE1 = 129, + NPU_SET_IFM2_BASE2 = 130, + NPU_SET_IFM2_BASE3 = 131, + 
NPU_SET_IFM2_STRIDE_X = 132, + NPU_SET_IFM2_STRIDE_Y = 133, + NPU_SET_IFM2_STRIDE_C = 134, + NPU_SET_WEIGHT1_BASE = 144, + NPU_SET_WEIGHT1_LENGTH = 145, + NPU_SET_SCALE1_BASE = 146, + NPU_SET_SCALE1_LENGTH = 147, +}; + +enum class cmd_ctrl : uint8_t +{ + CMD0_CTRL = 0, + CMD1_CTRL = 1, +}; + +enum class custom_dma : uint8_t +{ + NOT_IMPLEMENTED = 0, + IMPLEMENTED = 1, +}; + +enum class dma_fault_channel : uint8_t +{ + CMD_READ = 0, + IFM_READ = 1, + WEIGHT_READ = 2, + SBS_READ = 3, + MEM2MEM_READ = 4, + OFM_WRITE = 8, + MEM2MEM_WRITE = 9, +}; + +enum class dma_fault_src : uint8_t +{ + AXI_M0 = 0, + AXI_M1 = 1, +}; + +enum class dma_region_mode : uint8_t +{ + EXTERNAL = 0, + INTERNAL = 1, +}; + +enum class dma_stride_mode : uint8_t +{ + D1 = 0, + D2 = 1, + D3 = 2, +}; + +enum class elementwise_mode : uint8_t +{ + MUL = 0, + ADD = 1, + SUB = 2, + MIN = 3, + MAX = 4, + LRELU = 5, + ABS = 6, + CLZ = 7, + SHR = 8, + SHL = 9, +}; + +enum class ifm2_operand_order : uint8_t +{ + ORDER_B = 0, + ORDER_A = 1, +}; + +enum class ifm_scale_mode : uint8_t +{ + OPA_OPB_16 = 0, + OPA_32 = 1, + OPB_32 = 2, +}; + +enum class ifm_upscale_mode : uint8_t +{ + NONE = 0, + NEAREST = 1, + ZEROS = 2, +}; + +enum class kernel_decomposition : uint8_t +{ + D8X8 = 0, + D4X4 = 1, +}; + +enum class kernel_dilation : uint8_t +{ + NONE = 0, + X2 = 1, +}; + +enum class max_beats : uint8_t +{ + B64 = 0, + B128 = 1, + B256 = 2, +}; + +enum class mem_attr : uint8_t +{ + AXI0_OUTSTANDING_COUNTER0 = 0, + AXI0_OUTSTANDING_COUNTER1 = 1, + AXI1_OUTSTANDING_COUNTER2 = 2, + AXI1_OUTSTANDING_COUNTER3 = 3, +}; + +enum class ofm_scale_mode : uint8_t +{ + PER_CHANNEL = 0, + GLOBAL = 1, +}; + +enum class parallel_mode : uint8_t +{ + SINGLE_CORE = 0, + DUAL_CORE_DEPTH = 1, +}; + +enum class pmu_axi_channel : uint8_t +{ + RD_CMD = 0, + RD_IFM = 1, + RD_WEIGHTS = 2, + RD_SCALE_BIAS = 3, + RD_MEM2MEM = 4, + WR_OFM = 8, + WR_MEM2MEM = 9, +}; + +enum class pmu_event : uint16_t +{ + NO_EVENT = 0, + CYCLE = 17, + NPU_IDLE 
= 32, + CC_STALLED_ON_BLOCKDEP = 33, + CC_STALLED_ON_SHRAM_RECONFIG = 34, + NPU_ACTIVE = 35, + MAC_ACTIVE = 48, + MAC_ACTIVE_8BIT = 49, + MAC_ACTIVE_16BIT = 50, + MAC_DPU_ACTIVE = 51, + MAC_STALLED_BY_WD_ACC = 52, + MAC_STALLED_BY_WD = 53, + MAC_STALLED_BY_ACC = 54, + MAC_STALLED_BY_IB = 55, + MAC_ACTIVE_32BIT = 56, + MAC_STALLED_BY_INT_W = 57, + MAC_STALLED_BY_INT_ACC = 58, + AO_ACTIVE = 64, + AO_ACTIVE_8BIT = 65, + AO_ACTIVE_16BIT = 66, + AO_STALLED_BY_OFMP_OB = 67, + AO_STALLED_BY_OFMP = 68, + AO_STALLED_BY_OB = 69, + AO_STALLED_BY_ACC_IB = 70, + AO_STALLED_BY_ACC = 71, + AO_STALLED_BY_IB = 72, + WD_ACTIVE = 80, + WD_STALLED = 81, + WD_STALLED_BY_WS = 82, + WD_STALLED_BY_WD_BUF = 83, + WD_PARSE_ACTIVE = 84, + WD_PARSE_STALLED = 85, + WD_PARSE_STALLED_IN = 86, + WD_PARSE_STALLED_OUT = 87, + WD_TRANS_WS = 88, + WD_TRANS_WB = 89, + WD_TRANS_DW0 = 90, + WD_TRANS_DW1 = 91, + AXI0_RD_TRANS_ACCEPTED = 128, + AXI0_RD_TRANS_COMPLETED = 129, + AXI0_RD_DATA_BEAT_RECEIVED = 130, + AXI0_RD_TRAN_REQ_STALLED = 131, + AXI0_WR_TRANS_ACCEPTED = 132, + AXI0_WR_TRANS_COMPLETED_M = 133, + AXI0_WR_TRANS_COMPLETED_S = 134, + AXI0_WR_DATA_BEAT_WRITTEN = 135, + AXI0_WR_TRAN_REQ_STALLED = 136, + AXI0_WR_DATA_BEAT_STALLED = 137, + AXI0_ENABLED_CYCLES = 140, + AXI0_RD_STALL_LIMIT = 142, + AXI0_WR_STALL_LIMIT = 143, + AXI_LATENCY_ANY = 160, + AXI_LATENCY_32 = 161, + AXI_LATENCY_64 = 162, + AXI_LATENCY_128 = 163, + AXI_LATENCY_256 = 164, + AXI_LATENCY_512 = 165, + AXI_LATENCY_1024 = 166, + ECC_DMA = 176, + ECC_SB0 = 177, + AXI1_RD_TRANS_ACCEPTED = 384, + AXI1_RD_TRANS_COMPLETED = 385, + AXI1_RD_DATA_BEAT_RECEIVED = 386, + AXI1_RD_TRAN_REQ_STALLED = 387, + AXI1_WR_TRANS_ACCEPTED = 388, + AXI1_WR_TRANS_COMPLETED_M = 389, + AXI1_WR_TRANS_COMPLETED_S = 390, + AXI1_WR_DATA_BEAT_WRITTEN = 391, + AXI1_WR_TRAN_REQ_STALLED = 392, + AXI1_WR_DATA_BEAT_STALLED = 393, + AXI1_ENABLED_CYCLES = 396, + AXI1_RD_STALL_LIMIT = 398, + AXI1_WR_STALL_LIMIT = 399, + ECC_SB1 = 433, +}; + +enum class pooling_mode : 
uint8_t +{ + MAX = 0, + AVERAGE = 1, + REDUCE_SUM = 2, +}; + +enum class privilege_level : uint8_t +{ + USER = 0, + PRIVILEGED = 1, +}; + +enum class round_mode : uint8_t +{ + DBL = 0, + TRUNCATE = 1, + NATURAL = 2, +}; + +enum class security_level : uint8_t +{ + SECURE = 0, + NON_SECURE = 1, +}; + +enum class state : uint8_t +{ + STOPPED = 0, + RUNNING = 1, +}; + +enum class wd_core_slice_state : uint8_t +{ + HEADER = 0, + PALETTE = 1, + WEIGHTS = 2, +}; + +enum class wd_ctrl_state : uint8_t +{ + IDLE = 0, + DRAIN = 1, + OFD_INIT = 2, + OFD_RUN = 3, +}; + +enum class weight_order : uint8_t +{ + DEPTH_FIRST = 0, + PART_KERNEL_FIRST = 1, +}; + +#else + +enum acc_format +{ + ACC_FORMAT_I32 = 0, + ACC_FORMAT_I40 = 1, + ACC_FORMAT_F16 = 2, +}; + +enum activation_clip_range +{ + ACTIVATION_CLIP_RANGE_OFM_PRECISION = 0, + ACTIVATION_CLIP_RANGE_FORCE_UINT8 = 2, + ACTIVATION_CLIP_RANGE_FORCE_INT8 = 3, + ACTIVATION_CLIP_RANGE_FORCE_INT16 = 5, +}; + +enum activation_format +{ + ACTIVATION_FORMAT_NHWC = 0, + ACTIVATION_FORMAT_NHCWB16 = 1, +}; + +enum activation_function +{ + ACTIVATION_FUNCTION_RELU = 0, + ACTIVATION_FUNCTION_TANH = 3, + ACTIVATION_FUNCTION_SIGMOID = 4, + ACTIVATION_FUNCTION_TABLE_0 = 16, + ACTIVATION_FUNCTION_TABLE_1 = 17, + ACTIVATION_FUNCTION_TABLE_2 = 18, + ACTIVATION_FUNCTION_TABLE_3 = 19, + ACTIVATION_FUNCTION_TABLE_4 = 20, + ACTIVATION_FUNCTION_TABLE_5 = 21, + ACTIVATION_FUNCTION_TABLE_6 = 22, + ACTIVATION_FUNCTION_TABLE_7 = 23, +}; + +enum activation_precision +{ + ACTIVATION_PRECISION_B8 = 0, + ACTIVATION_PRECISION_B16 = 1, + ACTIVATION_PRECISION_B32 = 2, + ACTIVATION_PRECISION_B64 = 3, +}; + +enum activation_type +{ + ACTIVATION_TYPE_UNSIGNED = 0, + ACTIVATION_TYPE_SIGNED = 1, +}; + +enum axi_mem_encoding +{ + AXI_MEM_ENCODING_DEVICE_NON_BUFFERABLE = 0, + AXI_MEM_ENCODING_DEVICE_BUFFERABLE = 1, + AXI_MEM_ENCODING_NORMAL_NON_CACHEABLE_NON_BUFFERABLE = 2, + AXI_MEM_ENCODING_NORMAL_NON_CACHEABLE_BUFFERABLE = 3, + 
AXI_MEM_ENCODING_WRITE_THROUGH_NO_ALLOCATE = 4, + AXI_MEM_ENCODING_WRITE_THROUGH_READ_ALLOCATE = 5, + AXI_MEM_ENCODING_WRITE_THROUGH_WRITE_ALLOCATE = 6, + AXI_MEM_ENCODING_WRITE_THROUGH_READ_AND_WRITE_ALLOCATE = 7, + AXI_MEM_ENCODING_WRITE_BACK_NO_ALLOCATE = 8, + AXI_MEM_ENCODING_WRITE_BACK_READ_ALLOCATE = 9, + AXI_MEM_ENCODING_WRITE_BACK_WRITE_ALLOCATE = 10, + AXI_MEM_ENCODING_WRITE_BACK_READ_AND_WRITE_ALLOCATE = 11, +}; + +enum broadcast_mode +{ + BROADCAST_MODE_DISABLE = 0, + BROADCAST_MODE_ENABLE = 1, +}; + +enum cmd0_opcode +{ + CMD0_OPCODE_NPU_OP_STOP = 0, + CMD0_OPCODE_NPU_OP_IRQ = 1, + CMD0_OPCODE_NPU_OP_CONV = 2, + CMD0_OPCODE_NPU_OP_DEPTHWISE = 3, + CMD0_OPCODE_NPU_OP_POOL = 5, + CMD0_OPCODE_NPU_OP_ELEMENTWISE = 6, + CMD0_OPCODE_NPU_OP_DMA_START = 16, + CMD0_OPCODE_NPU_OP_DMA_WAIT = 17, + CMD0_OPCODE_NPU_OP_KERNEL_WAIT = 18, + CMD0_OPCODE_NPU_OP_PMU_MASK = 19, + CMD0_OPCODE_NPU_SET_IFM_PAD_TOP = 256, + CMD0_OPCODE_NPU_SET_IFM_PAD_LEFT = 257, + CMD0_OPCODE_NPU_SET_IFM_PAD_RIGHT = 258, + CMD0_OPCODE_NPU_SET_IFM_PAD_BOTTOM = 259, + CMD0_OPCODE_NPU_SET_IFM_DEPTH_M1 = 260, + CMD0_OPCODE_NPU_SET_IFM_PRECISION = 261, + CMD0_OPCODE_NPU_SET_IFM_UPSCALE = 263, + CMD0_OPCODE_NPU_SET_IFM_ZERO_POINT = 265, + CMD0_OPCODE_NPU_SET_IFM_WIDTH0_M1 = 266, + CMD0_OPCODE_NPU_SET_IFM_HEIGHT0_M1 = 267, + CMD0_OPCODE_NPU_SET_IFM_HEIGHT1_M1 = 268, + CMD0_OPCODE_NPU_SET_IFM_IB_END = 269, + CMD0_OPCODE_NPU_SET_IFM_REGION = 271, + CMD0_OPCODE_NPU_SET_OFM_WIDTH_M1 = 273, + CMD0_OPCODE_NPU_SET_OFM_HEIGHT_M1 = 274, + CMD0_OPCODE_NPU_SET_OFM_DEPTH_M1 = 275, + CMD0_OPCODE_NPU_SET_OFM_PRECISION = 276, + CMD0_OPCODE_NPU_SET_OFM_BLK_WIDTH_M1 = 277, + CMD0_OPCODE_NPU_SET_OFM_BLK_HEIGHT_M1 = 278, + CMD0_OPCODE_NPU_SET_OFM_BLK_DEPTH_M1 = 279, + CMD0_OPCODE_NPU_SET_OFM_ZERO_POINT = 280, + CMD0_OPCODE_NPU_SET_OFM_WIDTH0_M1 = 282, + CMD0_OPCODE_NPU_SET_OFM_HEIGHT0_M1 = 283, + CMD0_OPCODE_NPU_SET_OFM_HEIGHT1_M1 = 284, + CMD0_OPCODE_NPU_SET_OFM_REGION = 287, + CMD0_OPCODE_NPU_SET_KERNEL_WIDTH_M1 = 
288, + CMD0_OPCODE_NPU_SET_KERNEL_HEIGHT_M1 = 289, + CMD0_OPCODE_NPU_SET_KERNEL_STRIDE = 290, + CMD0_OPCODE_NPU_SET_PARALLEL_MODE = 291, + CMD0_OPCODE_NPU_SET_ACC_FORMAT = 292, + CMD0_OPCODE_NPU_SET_ACTIVATION = 293, + CMD0_OPCODE_NPU_SET_ACTIVATION_MIN = 294, + CMD0_OPCODE_NPU_SET_ACTIVATION_MAX = 295, + CMD0_OPCODE_NPU_SET_WEIGHT_REGION = 296, + CMD0_OPCODE_NPU_SET_SCALE_REGION = 297, + CMD0_OPCODE_NPU_SET_AB_START = 301, + CMD0_OPCODE_NPU_SET_BLOCKDEP = 303, + CMD0_OPCODE_NPU_SET_DMA0_SRC_REGION = 304, + CMD0_OPCODE_NPU_SET_DMA0_DST_REGION = 305, + CMD0_OPCODE_NPU_SET_DMA0_SIZE0 = 306, + CMD0_OPCODE_NPU_SET_DMA0_SIZE1 = 307, + CMD0_OPCODE_NPU_SET_IFM2_BROADCAST = 384, + CMD0_OPCODE_NPU_SET_IFM2_SCALAR = 385, + CMD0_OPCODE_NPU_SET_IFM2_PRECISION = 389, + CMD0_OPCODE_NPU_SET_IFM2_ZERO_POINT = 393, + CMD0_OPCODE_NPU_SET_IFM2_WIDTH0_M1 = 394, + CMD0_OPCODE_NPU_SET_IFM2_HEIGHT0_M1 = 395, + CMD0_OPCODE_NPU_SET_IFM2_HEIGHT1_M1 = 396, + CMD0_OPCODE_NPU_SET_IFM2_IB_START = 397, + CMD0_OPCODE_NPU_SET_IFM2_REGION = 399, +}; + +enum cmd1_opcode +{ + CMD1_OPCODE_NPU_SET_IFM_BASE0 = 0, + CMD1_OPCODE_NPU_SET_IFM_BASE1 = 1, + CMD1_OPCODE_NPU_SET_IFM_BASE2 = 2, + CMD1_OPCODE_NPU_SET_IFM_BASE3 = 3, + CMD1_OPCODE_NPU_SET_IFM_STRIDE_X = 4, + CMD1_OPCODE_NPU_SET_IFM_STRIDE_Y = 5, + CMD1_OPCODE_NPU_SET_IFM_STRIDE_C = 6, + CMD1_OPCODE_NPU_SET_OFM_BASE0 = 16, + CMD1_OPCODE_NPU_SET_OFM_BASE1 = 17, + CMD1_OPCODE_NPU_SET_OFM_BASE2 = 18, + CMD1_OPCODE_NPU_SET_OFM_BASE3 = 19, + CMD1_OPCODE_NPU_SET_OFM_STRIDE_X = 20, + CMD1_OPCODE_NPU_SET_OFM_STRIDE_Y = 21, + CMD1_OPCODE_NPU_SET_OFM_STRIDE_C = 22, + CMD1_OPCODE_NPU_SET_WEIGHT_BASE = 32, + CMD1_OPCODE_NPU_SET_WEIGHT_LENGTH = 33, + CMD1_OPCODE_NPU_SET_SCALE_BASE = 34, + CMD1_OPCODE_NPU_SET_SCALE_LENGTH = 35, + CMD1_OPCODE_NPU_SET_OFM_SCALE = 36, + CMD1_OPCODE_NPU_SET_OPA_SCALE = 37, + CMD1_OPCODE_NPU_SET_OPB_SCALE = 38, + CMD1_OPCODE_NPU_SET_DMA0_SRC = 48, + CMD1_OPCODE_NPU_SET_DMA0_DST = 49, + CMD1_OPCODE_NPU_SET_DMA0_LEN = 50, + 
CMD1_OPCODE_NPU_SET_DMA0_SKIP0 = 51, + CMD1_OPCODE_NPU_SET_DMA0_SKIP1 = 52, + CMD1_OPCODE_NPU_SET_IFM2_BASE0 = 128, + CMD1_OPCODE_NPU_SET_IFM2_BASE1 = 129, + CMD1_OPCODE_NPU_SET_IFM2_BASE2 = 130, + CMD1_OPCODE_NPU_SET_IFM2_BASE3 = 131, + CMD1_OPCODE_NPU_SET_IFM2_STRIDE_X = 132, + CMD1_OPCODE_NPU_SET_IFM2_STRIDE_Y = 133, + CMD1_OPCODE_NPU_SET_IFM2_STRIDE_C = 134, + CMD1_OPCODE_NPU_SET_WEIGHT1_BASE = 144, + CMD1_OPCODE_NPU_SET_WEIGHT1_LENGTH = 145, + CMD1_OPCODE_NPU_SET_SCALE1_BASE = 146, + CMD1_OPCODE_NPU_SET_SCALE1_LENGTH = 147, +}; + +enum cmd_ctrl +{ + CMD_CTRL_CMD0_CTRL = 0, + CMD_CTRL_CMD1_CTRL = 1, +}; + +enum custom_dma +{ + CUSTOM_DMA_NOT_IMPLEMENTED = 0, + CUSTOM_DMA_IMPLEMENTED = 1, +}; + +enum dma_fault_channel +{ + DMA_FAULT_CHANNEL_CMD_READ = 0, + DMA_FAULT_CHANNEL_IFM_READ = 1, + DMA_FAULT_CHANNEL_WEIGHT_READ = 2, + DMA_FAULT_CHANNEL_SBS_READ = 3, + DMA_FAULT_CHANNEL_MEM2MEM_READ = 4, + DMA_FAULT_CHANNEL_OFM_WRITE = 8, + DMA_FAULT_CHANNEL_MEM2MEM_WRITE = 9, +}; + +enum dma_fault_src +{ + DMA_FAULT_SRC_AXI_M0 = 0, + DMA_FAULT_SRC_AXI_M1 = 1, +}; + +enum dma_region_mode +{ + DMA_REGION_MODE_EXTERNAL = 0, + DMA_REGION_MODE_INTERNAL = 1, +}; + +enum dma_stride_mode +{ + DMA_STRIDE_MODE_D1 = 0, + DMA_STRIDE_MODE_D2 = 1, + DMA_STRIDE_MODE_D3 = 2, +}; + +enum elementwise_mode +{ + ELEMENTWISE_MODE_MUL = 0, + ELEMENTWISE_MODE_ADD = 1, + ELEMENTWISE_MODE_SUB = 2, + ELEMENTWISE_MODE_MIN = 3, + ELEMENTWISE_MODE_MAX = 4, + ELEMENTWISE_MODE_LRELU = 5, + ELEMENTWISE_MODE_ABS = 6, + ELEMENTWISE_MODE_CLZ = 7, + ELEMENTWISE_MODE_SHR = 8, + ELEMENTWISE_MODE_SHL = 9, +}; + +enum ifm2_operand_order +{ + IFM2_OPERAND_ORDER_ORDER_B = 0, + IFM2_OPERAND_ORDER_ORDER_A = 1, +}; + +enum ifm_scale_mode +{ + IFM_SCALE_MODE_OPA_OPB_16 = 0, + IFM_SCALE_MODE_OPA_32 = 1, + IFM_SCALE_MODE_OPB_32 = 2, +}; + +enum ifm_upscale_mode +{ + IFM_UPSCALE_MODE_NONE = 0, + IFM_UPSCALE_MODE_NEAREST = 1, + IFM_UPSCALE_MODE_ZEROS = 2, +}; + +enum kernel_decomposition +{ + KERNEL_DECOMPOSITION_D8X8 = 
0, + KERNEL_DECOMPOSITION_D4X4 = 1, +}; + +enum kernel_dilation +{ + KERNEL_DILATION_NONE = 0, + KERNEL_DILATION_X2 = 1, +}; + +enum max_beats +{ + MAX_BEATS_B64 = 0, + MAX_BEATS_B128 = 1, + MAX_BEATS_B256 = 2, +}; + +enum mem_attr +{ + MEM_ATTR_AXI0_OUTSTANDING_COUNTER0 = 0, + MEM_ATTR_AXI0_OUTSTANDING_COUNTER1 = 1, + MEM_ATTR_AXI1_OUTSTANDING_COUNTER2 = 2, + MEM_ATTR_AXI1_OUTSTANDING_COUNTER3 = 3, +}; + +enum ofm_scale_mode +{ + OFM_SCALE_MODE_PER_CHANNEL = 0, + OFM_SCALE_MODE_GLOBAL = 1, +}; + +enum parallel_mode +{ + PARALLEL_MODE_SINGLE_CORE = 0, + PARALLEL_MODE_DUAL_CORE_DEPTH = 1, +}; + +enum pmu_axi_channel +{ + PMU_AXI_CHANNEL_RD_CMD = 0, + PMU_AXI_CHANNEL_RD_IFM = 1, + PMU_AXI_CHANNEL_RD_WEIGHTS = 2, + PMU_AXI_CHANNEL_RD_SCALE_BIAS = 3, + PMU_AXI_CHANNEL_RD_MEM2MEM = 4, + PMU_AXI_CHANNEL_WR_OFM = 8, + PMU_AXI_CHANNEL_WR_MEM2MEM = 9, +}; + +enum pmu_event +{ + PMU_EVENT_NO_EVENT = 0, + PMU_EVENT_CYCLE = 17, + PMU_EVENT_NPU_IDLE = 32, + PMU_EVENT_CC_STALLED_ON_BLOCKDEP = 33, + PMU_EVENT_CC_STALLED_ON_SHRAM_RECONFIG = 34, + PMU_EVENT_NPU_ACTIVE = 35, + PMU_EVENT_MAC_ACTIVE = 48, + PMU_EVENT_MAC_ACTIVE_8BIT = 49, + PMU_EVENT_MAC_ACTIVE_16BIT = 50, + PMU_EVENT_MAC_DPU_ACTIVE = 51, + PMU_EVENT_MAC_STALLED_BY_WD_ACC = 52, + PMU_EVENT_MAC_STALLED_BY_WD = 53, + PMU_EVENT_MAC_STALLED_BY_ACC = 54, + PMU_EVENT_MAC_STALLED_BY_IB = 55, + PMU_EVENT_MAC_ACTIVE_32BIT = 56, + PMU_EVENT_MAC_STALLED_BY_INT_W = 57, + PMU_EVENT_MAC_STALLED_BY_INT_ACC = 58, + PMU_EVENT_AO_ACTIVE = 64, + PMU_EVENT_AO_ACTIVE_8BIT = 65, + PMU_EVENT_AO_ACTIVE_16BIT = 66, + PMU_EVENT_AO_STALLED_BY_OFMP_OB = 67, + PMU_EVENT_AO_STALLED_BY_OFMP = 68, + PMU_EVENT_AO_STALLED_BY_OB = 69, + PMU_EVENT_AO_STALLED_BY_ACC_IB = 70, + PMU_EVENT_AO_STALLED_BY_ACC = 71, + PMU_EVENT_AO_STALLED_BY_IB = 72, + PMU_EVENT_WD_ACTIVE = 80, + PMU_EVENT_WD_STALLED = 81, + PMU_EVENT_WD_STALLED_BY_WS = 82, + PMU_EVENT_WD_STALLED_BY_WD_BUF = 83, + PMU_EVENT_WD_PARSE_ACTIVE = 84, + PMU_EVENT_WD_PARSE_STALLED = 85, + 
PMU_EVENT_WD_PARSE_STALLED_IN = 86, + PMU_EVENT_WD_PARSE_STALLED_OUT = 87, + PMU_EVENT_WD_TRANS_WS = 88, + PMU_EVENT_WD_TRANS_WB = 89, + PMU_EVENT_WD_TRANS_DW0 = 90, + PMU_EVENT_WD_TRANS_DW1 = 91, + PMU_EVENT_AXI0_RD_TRANS_ACCEPTED = 128, + PMU_EVENT_AXI0_RD_TRANS_COMPLETED = 129, + PMU_EVENT_AXI0_RD_DATA_BEAT_RECEIVED = 130, + PMU_EVENT_AXI0_RD_TRAN_REQ_STALLED = 131, + PMU_EVENT_AXI0_WR_TRANS_ACCEPTED = 132, + PMU_EVENT_AXI0_WR_TRANS_COMPLETED_M = 133, + PMU_EVENT_AXI0_WR_TRANS_COMPLETED_S = 134, + PMU_EVENT_AXI0_WR_DATA_BEAT_WRITTEN = 135, + PMU_EVENT_AXI0_WR_TRAN_REQ_STALLED = 136, + PMU_EVENT_AXI0_WR_DATA_BEAT_STALLED = 137, + PMU_EVENT_AXI0_ENABLED_CYCLES = 140, + PMU_EVENT_AXI0_RD_STALL_LIMIT = 142, + PMU_EVENT_AXI0_WR_STALL_LIMIT = 143, + PMU_EVENT_AXI_LATENCY_ANY = 160, + PMU_EVENT_AXI_LATENCY_32 = 161, + PMU_EVENT_AXI_LATENCY_64 = 162, + PMU_EVENT_AXI_LATENCY_128 = 163, + PMU_EVENT_AXI_LATENCY_256 = 164, + PMU_EVENT_AXI_LATENCY_512 = 165, + PMU_EVENT_AXI_LATENCY_1024 = 166, + PMU_EVENT_ECC_DMA = 176, + PMU_EVENT_ECC_SB0 = 177, + PMU_EVENT_AXI1_RD_TRANS_ACCEPTED = 384, + PMU_EVENT_AXI1_RD_TRANS_COMPLETED = 385, + PMU_EVENT_AXI1_RD_DATA_BEAT_RECEIVED = 386, + PMU_EVENT_AXI1_RD_TRAN_REQ_STALLED = 387, + PMU_EVENT_AXI1_WR_TRANS_ACCEPTED = 388, + PMU_EVENT_AXI1_WR_TRANS_COMPLETED_M = 389, + PMU_EVENT_AXI1_WR_TRANS_COMPLETED_S = 390, + PMU_EVENT_AXI1_WR_DATA_BEAT_WRITTEN = 391, + PMU_EVENT_AXI1_WR_TRAN_REQ_STALLED = 392, + PMU_EVENT_AXI1_WR_DATA_BEAT_STALLED = 393, + PMU_EVENT_AXI1_ENABLED_CYCLES = 396, + PMU_EVENT_AXI1_RD_STALL_LIMIT = 398, + PMU_EVENT_AXI1_WR_STALL_LIMIT = 399, + PMU_EVENT_ECC_SB1 = 433, +}; + +enum pooling_mode +{ + POOLING_MODE_MAX = 0, + POOLING_MODE_AVERAGE = 1, + POOLING_MODE_REDUCE_SUM = 2, +}; + +enum privilege_level +{ + PRIVILEGE_LEVEL_USER = 0, + PRIVILEGE_LEVEL_PRIVILEGED = 1, +}; + +enum round_mode +{ + ROUND_MODE_DBL = 0, + ROUND_MODE_TRUNCATE = 1, + ROUND_MODE_NATURAL = 2, +}; + +enum security_level +{ + SECURITY_LEVEL_SECURE = 
0, + SECURITY_LEVEL_NON_SECURE = 1, +}; + +enum state +{ + STATE_STOPPED = 0, + STATE_RUNNING = 1, +}; + +enum wd_core_slice_state +{ + WD_CORE_SLICE_STATE_HEADER = 0, + WD_CORE_SLICE_STATE_PALETTE = 1, + WD_CORE_SLICE_STATE_WEIGHTS = 2, +}; + +enum wd_ctrl_state +{ + WD_CTRL_STATE_IDLE = 0, + WD_CTRL_STATE_DRAIN = 1, + WD_CTRL_STATE_OFD_INIT = 2, + WD_CTRL_STATE_OFD_RUN = 3, +}; + +enum weight_order +{ + WEIGHT_ORDER_DEPTH_FIRST = 0, + WEIGHT_ORDER_PART_KERNEL_FIRST = 1, +}; + +#endif + +#ifdef NPU_DISASSEMBLE + +static const char* acc_format_str[] = +{ + "ACC_FORMAT_I32", + "ACC_FORMAT_I40", + "ACC_FORMAT_F16", +}; + +static const char* activation_clip_range_str[] = +{ + "ACTIVATION_CLIP_RANGE_OFM_PRECISION", + "****", + "ACTIVATION_CLIP_RANGE_FORCE_UINT8", + "ACTIVATION_CLIP_RANGE_FORCE_INT8", + "****", + "ACTIVATION_CLIP_RANGE_FORCE_INT16", +}; + +static const char* activation_format_str[] = +{ + "ACTIVATION_FORMAT_NHWC", + "ACTIVATION_FORMAT_NHCWB16", +}; + +static const char* activation_function_str[] = +{ + "ACTIVATION_FUNCTION_RELU", + "****", + "****", + "ACTIVATION_FUNCTION_TANH", + "ACTIVATION_FUNCTION_SIGMOID", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "ACTIVATION_FUNCTION_TABLE_0", + "ACTIVATION_FUNCTION_TABLE_1", + "ACTIVATION_FUNCTION_TABLE_2", + "ACTIVATION_FUNCTION_TABLE_3", + "ACTIVATION_FUNCTION_TABLE_4", + "ACTIVATION_FUNCTION_TABLE_5", + "ACTIVATION_FUNCTION_TABLE_6", + "ACTIVATION_FUNCTION_TABLE_7", +}; + +static const char* activation_precision_str[] = +{ + "ACTIVATION_PRECISION_B8", + "ACTIVATION_PRECISION_B16", + "ACTIVATION_PRECISION_B32", + "ACTIVATION_PRECISION_B64", +}; + +static const char* activation_type_str[] = +{ + "ACTIVATION_TYPE_UNSIGNED", + "ACTIVATION_TYPE_SIGNED", +}; + +static const char* axi_mem_encoding_str[] = +{ + "AXI_MEM_ENCODING_DEVICE_NON_BUFFERABLE", + "AXI_MEM_ENCODING_DEVICE_BUFFERABLE", + "AXI_MEM_ENCODING_NORMAL_NON_CACHEABLE_NON_BUFFERABLE", 
+ "AXI_MEM_ENCODING_NORMAL_NON_CACHEABLE_BUFFERABLE", + "AXI_MEM_ENCODING_WRITE_THROUGH_NO_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_THROUGH_READ_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_THROUGH_WRITE_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_THROUGH_READ_AND_WRITE_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_BACK_NO_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_BACK_READ_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_BACK_WRITE_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_BACK_READ_AND_WRITE_ALLOCATE", +}; + +static const char* broadcast_mode_str[] = +{ + "BROADCAST_MODE_DISABLE", + "BROADCAST_MODE_ENABLE", +}; + +static const char* cmd0_opcode_str[] = +{ + "CMD0_OPCODE_NPU_OP_STOP", + "CMD0_OPCODE_NPU_OP_IRQ", + "CMD0_OPCODE_NPU_OP_CONV", + "CMD0_OPCODE_NPU_OP_DEPTHWISE", + "****", + "CMD0_OPCODE_NPU_OP_POOL", + "CMD0_OPCODE_NPU_OP_ELEMENTWISE", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_OP_DMA_START", + "CMD0_OPCODE_NPU_OP_DMA_WAIT", + "CMD0_OPCODE_NPU_OP_KERNEL_WAIT", + "CMD0_OPCODE_NPU_OP_PMU_MASK", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + 
"****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_SET_IFM_PAD_TOP", + "CMD0_OPCODE_NPU_SET_IFM_PAD_LEFT", + "CMD0_OPCODE_NPU_SET_IFM_PAD_RIGHT", + "CMD0_OPCODE_NPU_SET_IFM_PAD_BOTTOM", + "CMD0_OPCODE_NPU_SET_IFM_DEPTH_M1", + "CMD0_OPCODE_NPU_SET_IFM_PRECISION", + "****", + "CMD0_OPCODE_NPU_SET_IFM_UPSCALE", + "****", + "CMD0_OPCODE_NPU_SET_IFM_ZERO_POINT", + "CMD0_OPCODE_NPU_SET_IFM_WIDTH0_M1", + "CMD0_OPCODE_NPU_SET_IFM_HEIGHT0_M1", + "CMD0_OPCODE_NPU_SET_IFM_HEIGHT1_M1", + "CMD0_OPCODE_NPU_SET_IFM_IB_END", + "****", + "CMD0_OPCODE_NPU_SET_IFM_REGION", + "****", + "CMD0_OPCODE_NPU_SET_OFM_WIDTH_M1", + 
"CMD0_OPCODE_NPU_SET_OFM_HEIGHT_M1", + "CMD0_OPCODE_NPU_SET_OFM_DEPTH_M1", + "CMD0_OPCODE_NPU_SET_OFM_PRECISION", + "CMD0_OPCODE_NPU_SET_OFM_BLK_WIDTH_M1", + "CMD0_OPCODE_NPU_SET_OFM_BLK_HEIGHT_M1", + "CMD0_OPCODE_NPU_SET_OFM_BLK_DEPTH_M1", + "CMD0_OPCODE_NPU_SET_OFM_ZERO_POINT", + "****", + "CMD0_OPCODE_NPU_SET_OFM_WIDTH0_M1", + "CMD0_OPCODE_NPU_SET_OFM_HEIGHT0_M1", + "CMD0_OPCODE_NPU_SET_OFM_HEIGHT1_M1", + "****", + "****", + "CMD0_OPCODE_NPU_SET_OFM_REGION", + "CMD0_OPCODE_NPU_SET_KERNEL_WIDTH_M1", + "CMD0_OPCODE_NPU_SET_KERNEL_HEIGHT_M1", + "CMD0_OPCODE_NPU_SET_KERNEL_STRIDE", + "CMD0_OPCODE_NPU_SET_PARALLEL_MODE", + "CMD0_OPCODE_NPU_SET_ACC_FORMAT", + "CMD0_OPCODE_NPU_SET_ACTIVATION", + "CMD0_OPCODE_NPU_SET_ACTIVATION_MIN", + "CMD0_OPCODE_NPU_SET_ACTIVATION_MAX", + "CMD0_OPCODE_NPU_SET_WEIGHT_REGION", + "CMD0_OPCODE_NPU_SET_SCALE_REGION", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_SET_AB_START", + "****", + "CMD0_OPCODE_NPU_SET_BLOCKDEP", + "CMD0_OPCODE_NPU_SET_DMA0_SRC_REGION", + "CMD0_OPCODE_NPU_SET_DMA0_DST_REGION", + "CMD0_OPCODE_NPU_SET_DMA0_SIZE0", + "CMD0_OPCODE_NPU_SET_DMA0_SIZE1", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_SET_IFM2_BROADCAST", + "CMD0_OPCODE_NPU_SET_IFM2_SCALAR", + "****", + "****", + "****", + 
"CMD0_OPCODE_NPU_SET_IFM2_PRECISION", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_SET_IFM2_ZERO_POINT", + "CMD0_OPCODE_NPU_SET_IFM2_WIDTH0_M1", + "CMD0_OPCODE_NPU_SET_IFM2_HEIGHT0_M1", + "CMD0_OPCODE_NPU_SET_IFM2_HEIGHT1_M1", + "CMD0_OPCODE_NPU_SET_IFM2_IB_START", + "****", + "CMD0_OPCODE_NPU_SET_IFM2_REGION", +}; + +static const char* cmd1_opcode_str[] = +{ + "CMD1_OPCODE_NPU_SET_IFM_BASE0", + "CMD1_OPCODE_NPU_SET_IFM_BASE1", + "CMD1_OPCODE_NPU_SET_IFM_BASE2", + "CMD1_OPCODE_NPU_SET_IFM_BASE3", + "CMD1_OPCODE_NPU_SET_IFM_STRIDE_X", + "CMD1_OPCODE_NPU_SET_IFM_STRIDE_Y", + "CMD1_OPCODE_NPU_SET_IFM_STRIDE_C", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_OFM_BASE0", + "CMD1_OPCODE_NPU_SET_OFM_BASE1", + "CMD1_OPCODE_NPU_SET_OFM_BASE2", + "CMD1_OPCODE_NPU_SET_OFM_BASE3", + "CMD1_OPCODE_NPU_SET_OFM_STRIDE_X", + "CMD1_OPCODE_NPU_SET_OFM_STRIDE_Y", + "CMD1_OPCODE_NPU_SET_OFM_STRIDE_C", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_WEIGHT_BASE", + "CMD1_OPCODE_NPU_SET_WEIGHT_LENGTH", + "CMD1_OPCODE_NPU_SET_SCALE_BASE", + "CMD1_OPCODE_NPU_SET_SCALE_LENGTH", + "CMD1_OPCODE_NPU_SET_OFM_SCALE", + "CMD1_OPCODE_NPU_SET_OPA_SCALE", + "CMD1_OPCODE_NPU_SET_OPB_SCALE", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_DMA0_SRC", + "CMD1_OPCODE_NPU_SET_DMA0_DST", + "CMD1_OPCODE_NPU_SET_DMA0_LEN", + "CMD1_OPCODE_NPU_SET_DMA0_SKIP0", + "CMD1_OPCODE_NPU_SET_DMA0_SKIP1", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + 
"****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_IFM2_BASE0", + "CMD1_OPCODE_NPU_SET_IFM2_BASE1", + "CMD1_OPCODE_NPU_SET_IFM2_BASE2", + "CMD1_OPCODE_NPU_SET_IFM2_BASE3", + "CMD1_OPCODE_NPU_SET_IFM2_STRIDE_X", + "CMD1_OPCODE_NPU_SET_IFM2_STRIDE_Y", + "CMD1_OPCODE_NPU_SET_IFM2_STRIDE_C", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_WEIGHT1_BASE", + "CMD1_OPCODE_NPU_SET_WEIGHT1_LENGTH", + "CMD1_OPCODE_NPU_SET_SCALE1_BASE", + "CMD1_OPCODE_NPU_SET_SCALE1_LENGTH", +}; + +static const char* cmd_ctrl_str[] = +{ + "CMD_CTRL_CMD0_CTRL", + "CMD_CTRL_CMD1_CTRL", +}; + +static const char* custom_dma_str[] = +{ + "CUSTOM_DMA_NOT_IMPLEMENTED", + "CUSTOM_DMA_IMPLEMENTED", +}; + +static const char* dma_fault_channel_str[] = +{ + "DMA_FAULT_CHANNEL_CMD_READ", + "DMA_FAULT_CHANNEL_IFM_READ", + "DMA_FAULT_CHANNEL_WEIGHT_READ", + "DMA_FAULT_CHANNEL_SBS_READ", + "DMA_FAULT_CHANNEL_MEM2MEM_READ", + "****", + "****", + "****", + "DMA_FAULT_CHANNEL_OFM_WRITE", + "DMA_FAULT_CHANNEL_MEM2MEM_WRITE", +}; + +static const char* dma_fault_src_str[] = +{ + "DMA_FAULT_SRC_AXI_M0", + "DMA_FAULT_SRC_AXI_M1", +}; + +static const char* dma_region_mode_str[] = +{ + "DMA_REGION_MODE_EXTERNAL", + "DMA_REGION_MODE_INTERNAL", +}; + +static const char* dma_stride_mode_str[] = +{ + "DMA_STRIDE_MODE_D1", + "DMA_STRIDE_MODE_D2", + "DMA_STRIDE_MODE_D3", +}; + +static const char* elementwise_mode_str[] = +{ + "ELEMENTWISE_MODE_MUL", + "ELEMENTWISE_MODE_ADD", + "ELEMENTWISE_MODE_SUB", + "ELEMENTWISE_MODE_MIN", + "ELEMENTWISE_MODE_MAX", + "ELEMENTWISE_MODE_LRELU", + "ELEMENTWISE_MODE_ABS", + "ELEMENTWISE_MODE_CLZ", + "ELEMENTWISE_MODE_SHR", + 
"ELEMENTWISE_MODE_SHL", +}; + +static const char* ifm2_operand_order_str[] = +{ + "IFM2_OPERAND_ORDER_ORDER_B", + "IFM2_OPERAND_ORDER_ORDER_A", +}; + +static const char* ifm_scale_mode_str[] = +{ + "IFM_SCALE_MODE_OPA_OPB_16", + "IFM_SCALE_MODE_OPA_32", + "IFM_SCALE_MODE_OPB_32", +}; + +static const char* ifm_upscale_mode_str[] = +{ + "IFM_UPSCALE_MODE_NONE", + "IFM_UPSCALE_MODE_NEAREST", + "IFM_UPSCALE_MODE_ZEROS", +}; + +static const char* kernel_decomposition_str[] = +{ + "KERNEL_DECOMPOSITION_D8X8", + "KERNEL_DECOMPOSITION_D4X4", +}; + +static const char* kernel_dilation_str[] = +{ + "KERNEL_DILATION_NONE", + "KERNEL_DILATION_X2", +}; + +static const char* max_beats_str[] = +{ + "MAX_BEATS_B64", + "MAX_BEATS_B128", + "MAX_BEATS_B256", +}; + +static const char* mem_attr_str[] = +{ + "MEM_ATTR_AXI0_OUTSTANDING_COUNTER0", + "MEM_ATTR_AXI0_OUTSTANDING_COUNTER1", + "MEM_ATTR_AXI1_OUTSTANDING_COUNTER2", + "MEM_ATTR_AXI1_OUTSTANDING_COUNTER3", +}; + +static const char* ofm_scale_mode_str[] = +{ + "OFM_SCALE_MODE_PER_CHANNEL", + "OFM_SCALE_MODE_GLOBAL", +}; + +static const char* parallel_mode_str[] = +{ + "PARALLEL_MODE_SINGLE_CORE", + "PARALLEL_MODE_DUAL_CORE_DEPTH", +}; + +static const char* pmu_axi_channel_str[] = +{ + "PMU_AXI_CHANNEL_RD_CMD", + "PMU_AXI_CHANNEL_RD_IFM", + "PMU_AXI_CHANNEL_RD_WEIGHTS", + "PMU_AXI_CHANNEL_RD_SCALE_BIAS", + "PMU_AXI_CHANNEL_RD_MEM2MEM", + "****", + "****", + "****", + "PMU_AXI_CHANNEL_WR_OFM", + "PMU_AXI_CHANNEL_WR_MEM2MEM", +}; + +static const char* pmu_event_str[] = +{ + "PMU_EVENT_NO_EVENT", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_CYCLE", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_NPU_IDLE", + "PMU_EVENT_CC_STALLED_ON_BLOCKDEP", + "PMU_EVENT_CC_STALLED_ON_SHRAM_RECONFIG", + 
"PMU_EVENT_NPU_ACTIVE", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_MAC_ACTIVE", + "PMU_EVENT_MAC_ACTIVE_8BIT", + "PMU_EVENT_MAC_ACTIVE_16BIT", + "PMU_EVENT_MAC_DPU_ACTIVE", + "PMU_EVENT_MAC_STALLED_BY_WD_ACC", + "PMU_EVENT_MAC_STALLED_BY_WD", + "PMU_EVENT_MAC_STALLED_BY_ACC", + "PMU_EVENT_MAC_STALLED_BY_IB", + "PMU_EVENT_MAC_ACTIVE_32BIT", + "PMU_EVENT_MAC_STALLED_BY_INT_W", + "PMU_EVENT_MAC_STALLED_BY_INT_ACC", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_AO_ACTIVE", + "PMU_EVENT_AO_ACTIVE_8BIT", + "PMU_EVENT_AO_ACTIVE_16BIT", + "PMU_EVENT_AO_STALLED_BY_OFMP_OB", + "PMU_EVENT_AO_STALLED_BY_OFMP", + "PMU_EVENT_AO_STALLED_BY_OB", + "PMU_EVENT_AO_STALLED_BY_ACC_IB", + "PMU_EVENT_AO_STALLED_BY_ACC", + "PMU_EVENT_AO_STALLED_BY_IB", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_WD_ACTIVE", + "PMU_EVENT_WD_STALLED", + "PMU_EVENT_WD_STALLED_BY_WS", + "PMU_EVENT_WD_STALLED_BY_WD_BUF", + "PMU_EVENT_WD_PARSE_ACTIVE", + "PMU_EVENT_WD_PARSE_STALLED", + "PMU_EVENT_WD_PARSE_STALLED_IN", + "PMU_EVENT_WD_PARSE_STALLED_OUT", + "PMU_EVENT_WD_TRANS_WS", + "PMU_EVENT_WD_TRANS_WB", + "PMU_EVENT_WD_TRANS_DW0", + "PMU_EVENT_WD_TRANS_DW1", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_AXI0_RD_TRANS_ACCEPTED", + "PMU_EVENT_AXI0_RD_TRANS_COMPLETED", + "PMU_EVENT_AXI0_RD_DATA_BEAT_RECEIVED", + "PMU_EVENT_AXI0_RD_TRAN_REQ_STALLED", + "PMU_EVENT_AXI0_WR_TRANS_ACCEPTED", + "PMU_EVENT_AXI0_WR_TRANS_COMPLETED_M", + "PMU_EVENT_AXI0_WR_TRANS_COMPLETED_S", + "PMU_EVENT_AXI0_WR_DATA_BEAT_WRITTEN", + "PMU_EVENT_AXI0_WR_TRAN_REQ_STALLED", 
+ "PMU_EVENT_AXI0_WR_DATA_BEAT_STALLED", + "****", + "****", + "PMU_EVENT_AXI0_ENABLED_CYCLES", + "****", + "PMU_EVENT_AXI0_RD_STALL_LIMIT", + "PMU_EVENT_AXI0_WR_STALL_LIMIT", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_AXI_LATENCY_ANY", + "PMU_EVENT_AXI_LATENCY_32", + "PMU_EVENT_AXI_LATENCY_64", + "PMU_EVENT_AXI_LATENCY_128", + "PMU_EVENT_AXI_LATENCY_256", + "PMU_EVENT_AXI_LATENCY_512", + "PMU_EVENT_AXI_LATENCY_1024", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_ECC_DMA", + "PMU_EVENT_ECC_SB0", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + 
"****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_AXI1_RD_TRANS_ACCEPTED", + "PMU_EVENT_AXI1_RD_TRANS_COMPLETED", + "PMU_EVENT_AXI1_RD_DATA_BEAT_RECEIVED", + "PMU_EVENT_AXI1_RD_TRAN_REQ_STALLED", + "PMU_EVENT_AXI1_WR_TRANS_ACCEPTED", + "PMU_EVENT_AXI1_WR_TRANS_COMPLETED_M", + "PMU_EVENT_AXI1_WR_TRANS_COMPLETED_S", + "PMU_EVENT_AXI1_WR_DATA_BEAT_WRITTEN", + "PMU_EVENT_AXI1_WR_TRAN_REQ_STALLED", + "PMU_EVENT_AXI1_WR_DATA_BEAT_STALLED", + "****", + "****", + "PMU_EVENT_AXI1_ENABLED_CYCLES", + "****", + "PMU_EVENT_AXI1_RD_STALL_LIMIT", + "PMU_EVENT_AXI1_WR_STALL_LIMIT", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_ECC_SB1", +}; + +static const char* pooling_mode_str[] = +{ + "POOLING_MODE_MAX", + "POOLING_MODE_AVERAGE", + "POOLING_MODE_REDUCE_SUM", +}; + +static const char* privilege_level_str[] = +{ + "PRIVILEGE_LEVEL_USER", + "PRIVILEGE_LEVEL_PRIVILEGED", +}; + +static const char* round_mode_str[] = +{ + "ROUND_MODE_DBL", + "ROUND_MODE_TRUNCATE", + "ROUND_MODE_NATURAL", +}; + 
+static const char* security_level_str[] = +{ + "SECURITY_LEVEL_SECURE", + "SECURITY_LEVEL_NON_SECURE", +}; + +static const char* state_str[] = +{ + "STATE_STOPPED", + "STATE_RUNNING", +}; + +static const char* wd_core_slice_state_str[] = +{ + "WD_CORE_SLICE_STATE_HEADER", + "WD_CORE_SLICE_STATE_PALETTE", + "WD_CORE_SLICE_STATE_WEIGHTS", +}; + +static const char* wd_ctrl_state_str[] = +{ + "WD_CTRL_STATE_IDLE", + "WD_CTRL_STATE_DRAIN", + "WD_CTRL_STATE_OFD_INIT", + "WD_CTRL_STATE_OFD_RUN", +}; + +static const char* weight_order_str[] = +{ + "WEIGHT_ORDER_DEPTH_FIRST", + "WEIGHT_ORDER_PART_KERNEL_FIRST", +}; + +#endif + + + +struct id_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t version_status : 4; + uint32_t version_minor : 4; + uint32_t version_major : 4; + uint32_t product_major : 4; + uint32_t arch_patch_rev : 4; + uint32_t arch_minor_rev : 8; + uint32_t arch_major_rev : 4; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR id_r() : + word0(268853249) + {} + CONSTEXPR id_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + id_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_version_status() const + { + auto v = ((1U << 4) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR id_r& set_version_status(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<0) & word0) | ((((1U << 4) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_version_minor() const + { + auto v = ((1U << 4) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR id_r& set_version_minor(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<4) & word0) | ((((1U << 4) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_version_major() const + { + auto v = ((1U << 4) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR id_r& set_version_major(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<8) & word0) | ((((1U << 4) - 1) & 
value) << 8); + return *this; + } + CONSTEXPR uint32_t get_product_major() const + { + auto v = ((1U << 4) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR id_r& set_product_major(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<12) & word0) | ((((1U << 4) - 1) & value) << 12); + return *this; + } + CONSTEXPR uint32_t get_arch_patch_rev() const + { + auto v = ((1U << 4) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR id_r& set_arch_patch_rev(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<16) & word0) | ((((1U << 4) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_arch_minor_rev() const + { + auto v = ((1U << 8) - 1) & (word0 >> 20); + return v; + } + CONSTEXPR id_r& set_arch_minor_rev(uint32_t value) + { + word0 = (~(((1U << 8) - 1)<<20) & word0) | ((((1U << 8) - 1) & value) << 20); + return *this; + } + CONSTEXPR uint32_t get_arch_major_rev() const + { + auto v = ((1U << 4) - 1) & (word0 >> 28); + return v; + } + CONSTEXPR id_r& set_arch_major_rev(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<28) & word0) | ((((1U << 4) - 1) & value) << 28); + return *this; + } +#endif +}; + + +struct status_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t state : 1; + uint32_t irq_raised : 1; + uint32_t bus_status : 1; + uint32_t reset_status : 1; + uint32_t cmd_parse_error : 1; + uint32_t cmd_end_reached : 1; + uint32_t pmu_irq_raised : 1; + uint32_t wd_fault : 1; + uint32_t ecc_fault : 1; + uint32_t reserved0 : 2; + uint32_t faulting_interface : 1; + uint32_t faulting_channel : 4; + uint32_t irq_history_mask : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR status_r() : + word0(8) + {} + CONSTEXPR status_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + status_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::state get_state() const + { + auto v = ((1U << 1) - 1) & (word0 >> 
0); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR status_r& set_state(NPU_NAMESPACE::state value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR uint32_t get_irq_raised() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR status_r& set_irq_raised(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_bus_status() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR status_r& set_bus_status(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_reset_status() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR status_r& set_reset_status(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_cmd_parse_error() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR status_r& set_cmd_parse_error(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_cmd_end_reached() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR status_r& set_cmd_end_reached(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_pmu_irq_raised() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR status_r& set_pmu_irq_raised(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_wd_fault() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR status_r& set_wd_fault(uint32_t value) + { + word0 = 
(~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_ecc_fault() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR status_r& set_ecc_fault(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_fault_src get_faulting_interface() const + { + auto v = ((1U << 1) - 1) & (word0 >> 11); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR status_r& set_faulting_interface(NPU_NAMESPACE::dma_fault_src value) + { + word0 = (~(((1U << 1) - 1)<<11) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 11); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_fault_channel get_faulting_channel() const + { + auto v = ((1U << 4) - 1) & (word0 >> 12); + assert(v <= 9); + return static_cast(v); + } + CONSTEXPR status_r& set_faulting_channel(NPU_NAMESPACE::dma_fault_channel value) + { + word0 = (~(((1U << 4) - 1)<<12) & word0) | ((((1U << 4) - 1) & static_cast(value)) << 12); + return *this; + } + CONSTEXPR uint32_t get_irq_history_mask() const + { + auto v = ((1U << 16) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR status_r& set_irq_history_mask(uint32_t value) + { + word0 = (~(((1U << 16) - 1)<<16) & word0) | ((((1U << 16) - 1) & value) << 16); + return *this; + } +#endif +}; + + +struct cmd_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t transition_to_running_state : 1; + uint32_t clear_irq : 1; + uint32_t clock_q_enable : 1; + uint32_t power_q_enable : 1; + uint32_t stop_request : 1; + uint32_t reserved0 : 11; + uint32_t clear_irq_history : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cmd_r() : + word0(12) + {} + CONSTEXPR cmd_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cmd_r copy() + { + return *this; + } + 
CONSTEXPR uint32_t get_transition_to_running_state() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR cmd_r& set_transition_to_running_state(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_clear_irq() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR cmd_r& set_clear_irq(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_clock_q_enable() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR cmd_r& set_clock_q_enable(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_power_q_enable() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR cmd_r& set_power_q_enable(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_stop_request() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR cmd_r& set_stop_request(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_clear_irq_history() const + { + auto v = ((1U << 16) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR cmd_r& set_clear_irq_history(uint32_t value) + { + word0 = (~(((1U << 16) - 1)<<16) & word0) | ((((1U << 16) - 1) & value) << 16); + return *this; + } +#endif +}; + + +struct reset_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t pending_CPL : 1; + uint32_t pending_CSL : 1; + uint32_t reserved0 : 30; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR reset_r() : + word0(0) + {} + CONSTEXPR reset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + 
{ + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + reset_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::privilege_level get_pending_CPL() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR reset_r& set_pending_CPL(NPU_NAMESPACE::privilege_level value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::security_level get_pending_CSL() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR reset_r& set_pending_CSL(NPU_NAMESPACE::security_level value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 1); + return *this; + } +#endif +}; + + +struct qbase_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR qbase_r() : + word0(0), + word1(0) + {} + CONSTEXPR qbase_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + qbase_r copy() + { + return *this; + } +#endif +}; + + +struct qread_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t QREAD : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR qread_r() : + word0(0) + {} + CONSTEXPR qread_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR 
operator uint32_t() + { + return word0; + } + qread_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_QREAD() const + { + auto v = word0; + return v; + } + CONSTEXPR qread_r& set_QREAD(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct qconfig_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t cmd_region0 : 2; + uint32_t reserved0 : 30; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR qconfig_r() : + word0(0) + {} + CONSTEXPR qconfig_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + qconfig_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::mem_attr get_cmd_region0() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR qconfig_r& set_cmd_region0(NPU_NAMESPACE::mem_attr value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 0); + return *this; + } +#endif +}; + + +struct qsize_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t QSIZE : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR qsize_r() : + word0(0) + {} + CONSTEXPR qsize_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + qsize_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_QSIZE() const + { + auto v = word0; + return v; + } + CONSTEXPR qsize_r& set_QSIZE(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct prot_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t active_CPL : 1; + uint32_t active_CSL : 1; + uint32_t reserved0 : 30; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR prot_r() : + word0(0) + {} + CONSTEXPR prot_r(uint32_t init) : + word0(init) + {} + 
CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + prot_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::privilege_level get_active_CPL() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR prot_r& set_active_CPL(NPU_NAMESPACE::privilege_level value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::security_level get_active_CSL() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR prot_r& set_active_CSL(NPU_NAMESPACE::security_level value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 1); + return *this; + } +#endif +}; + + +struct config_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t macs_per_cc : 4; + uint32_t cmd_stream_version : 4; + uint32_t shram_size : 8; + uint32_t reserved0 : 11; + uint32_t custom_dma : 1; + uint32_t product : 4; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR config_r() : + word0(268435456) + {} + CONSTEXPR config_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + config_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_macs_per_cc() const + { + auto v = ((1U << 4) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR config_r& set_macs_per_cc(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<0) & word0) | ((((1U << 4) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_cmd_stream_version() const + { + auto v = ((1U << 4) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR config_r& set_cmd_stream_version(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<4) & word0) | ((((1U << 4) - 1) & value) << 4); + return *this; + } + 
CONSTEXPR uint32_t get_shram_size() const + { + auto v = ((1U << 8) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR config_r& set_shram_size(uint32_t value) + { + word0 = (~(((1U << 8) - 1)<<8) & word0) | ((((1U << 8) - 1) & value) << 8); + return *this; + } + CONSTEXPR NPU_NAMESPACE::custom_dma get_custom_dma() const + { + auto v = ((1U << 1) - 1) & (word0 >> 27); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR config_r& set_custom_dma(NPU_NAMESPACE::custom_dma value) + { + word0 = (~(((1U << 1) - 1)<<27) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 27); + return *this; + } + CONSTEXPR uint32_t get_product() const + { + auto v = ((1U << 4) - 1) & (word0 >> 28); + return v; + } + CONSTEXPR config_r& set_product(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<28) & word0) | ((((1U << 4) - 1) & value) << 28); + return *this; + } +#endif +}; + + +struct lock_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t LOCK : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR lock_r() : + word0(0) + {} + CONSTEXPR lock_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + lock_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_LOCK() const + { + auto v = word0; + return v; + } + CONSTEXPR lock_r& set_LOCK(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct regioncfg_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t region0 : 2; + uint32_t region1 : 2; + uint32_t region2 : 2; + uint32_t region3 : 2; + uint32_t region4 : 2; + uint32_t region5 : 2; + uint32_t region6 : 2; + uint32_t region7 : 2; + uint32_t reserved0 : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR regioncfg_r() : + word0(0) + {} + CONSTEXPR regioncfg_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + 
} + CONSTEXPR operator uint32_t() + { + return word0; + } + regioncfg_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::mem_attr get_region0() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR regioncfg_r& set_region0(NPU_NAMESPACE::mem_attr value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::mem_attr get_region1() const + { + auto v = ((1U << 2) - 1) & (word0 >> 2); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR regioncfg_r& set_region1(NPU_NAMESPACE::mem_attr value) + { + word0 = (~(((1U << 2) - 1)<<2) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 2); + return *this; + } + CONSTEXPR NPU_NAMESPACE::mem_attr get_region2() const + { + auto v = ((1U << 2) - 1) & (word0 >> 4); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR regioncfg_r& set_region2(NPU_NAMESPACE::mem_attr value) + { + word0 = (~(((1U << 2) - 1)<<4) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 4); + return *this; + } + CONSTEXPR NPU_NAMESPACE::mem_attr get_region3() const + { + auto v = ((1U << 2) - 1) & (word0 >> 6); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR regioncfg_r& set_region3(NPU_NAMESPACE::mem_attr value) + { + word0 = (~(((1U << 2) - 1)<<6) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 6); + return *this; + } + CONSTEXPR NPU_NAMESPACE::mem_attr get_region4() const + { + auto v = ((1U << 2) - 1) & (word0 >> 8); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR regioncfg_r& set_region4(NPU_NAMESPACE::mem_attr value) + { + word0 = (~(((1U << 2) - 1)<<8) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 8); + return *this; + } + CONSTEXPR NPU_NAMESPACE::mem_attr get_region5() const + { + auto v = ((1U << 2) - 1) & (word0 >> 10); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR regioncfg_r& set_region5(NPU_NAMESPACE::mem_attr value) + { + word0 
= (~(((1U << 2) - 1)<<10) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 10); + return *this; + } + CONSTEXPR NPU_NAMESPACE::mem_attr get_region6() const + { + auto v = ((1U << 2) - 1) & (word0 >> 12); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR regioncfg_r& set_region6(NPU_NAMESPACE::mem_attr value) + { + word0 = (~(((1U << 2) - 1)<<12) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 12); + return *this; + } + CONSTEXPR NPU_NAMESPACE::mem_attr get_region7() const + { + auto v = ((1U << 2) - 1) & (word0 >> 14); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR regioncfg_r& set_region7(NPU_NAMESPACE::mem_attr value) + { + word0 = (~(((1U << 2) - 1)<<14) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 14); + return *this; + } +#endif +}; + + +struct axi_limit0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t max_beats : 2; + uint32_t reserved0 : 2; + uint32_t memtype : 4; + uint32_t reserved1 : 8; + uint32_t max_outstanding_read_m1 : 6; + uint32_t reserved2 : 2; + uint32_t max_outstanding_write_m1 : 5; + uint32_t reserved3 : 3; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR axi_limit0_r() : + word0(0) + {} + CONSTEXPR axi_limit0_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + axi_limit0_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::max_beats get_max_beats() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + assert(v <= 2); + return static_cast(v); + } + CONSTEXPR axi_limit0_r& set_max_beats(NPU_NAMESPACE::max_beats value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::axi_mem_encoding get_memtype() const + { + auto v = ((1U << 4) - 1) & (word0 >> 4); + assert(v <= 11); + return static_cast(v); + } + CONSTEXPR axi_limit0_r& 
set_memtype(NPU_NAMESPACE::axi_mem_encoding value) + { + word0 = (~(((1U << 4) - 1)<<4) & word0) | ((((1U << 4) - 1) & static_cast(value)) << 4); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_read_m1() const + { + auto v = ((1U << 6) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR axi_limit0_r& set_max_outstanding_read_m1(uint32_t value) + { + word0 = (~(((1U << 6) - 1)<<16) & word0) | ((((1U << 6) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_write_m1() const + { + auto v = ((1U << 5) - 1) & (word0 >> 24); + return v; + } + CONSTEXPR axi_limit0_r& set_max_outstanding_write_m1(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<24) & word0) | ((((1U << 5) - 1) & value) << 24); + return *this; + } +#endif +}; + + +struct axi_limit1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t max_beats : 2; + uint32_t reserved0 : 2; + uint32_t memtype : 4; + uint32_t reserved1 : 8; + uint32_t max_outstanding_read_m1 : 6; + uint32_t reserved2 : 2; + uint32_t max_outstanding_write_m1 : 5; + uint32_t reserved3 : 3; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR axi_limit1_r() : + word0(0) + {} + CONSTEXPR axi_limit1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + axi_limit1_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::max_beats get_max_beats() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + assert(v <= 2); + return static_cast(v); + } + CONSTEXPR axi_limit1_r& set_max_beats(NPU_NAMESPACE::max_beats value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::axi_mem_encoding get_memtype() const + { + auto v = ((1U << 4) - 1) & (word0 >> 4); + assert(v <= 11); + return static_cast(v); + } + CONSTEXPR axi_limit1_r& set_memtype(NPU_NAMESPACE::axi_mem_encoding 
value) + { + word0 = (~(((1U << 4) - 1)<<4) & word0) | ((((1U << 4) - 1) & static_cast(value)) << 4); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_read_m1() const + { + auto v = ((1U << 6) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR axi_limit1_r& set_max_outstanding_read_m1(uint32_t value) + { + word0 = (~(((1U << 6) - 1)<<16) & word0) | ((((1U << 6) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_write_m1() const + { + auto v = ((1U << 5) - 1) & (word0 >> 24); + return v; + } + CONSTEXPR axi_limit1_r& set_max_outstanding_write_m1(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<24) & word0) | ((((1U << 5) - 1) & value) << 24); + return *this; + } +#endif +}; + + +struct axi_limit2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t max_beats : 2; + uint32_t reserved0 : 2; + uint32_t memtype : 4; + uint32_t reserved1 : 8; + uint32_t max_outstanding_read_m1 : 6; + uint32_t reserved2 : 2; + uint32_t max_outstanding_write_m1 : 5; + uint32_t reserved3 : 3; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR axi_limit2_r() : + word0(0) + {} + CONSTEXPR axi_limit2_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + axi_limit2_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::max_beats get_max_beats() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + assert(v <= 2); + return static_cast(v); + } + CONSTEXPR axi_limit2_r& set_max_beats(NPU_NAMESPACE::max_beats value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::axi_mem_encoding get_memtype() const + { + auto v = ((1U << 4) - 1) & (word0 >> 4); + assert(v <= 11); + return static_cast(v); + } + CONSTEXPR axi_limit2_r& set_memtype(NPU_NAMESPACE::axi_mem_encoding value) + { + word0 = (~(((1U << 4) - 1)<<4) & 
word0) | ((((1U << 4) - 1) & static_cast(value)) << 4); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_read_m1() const + { + auto v = ((1U << 6) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR axi_limit2_r& set_max_outstanding_read_m1(uint32_t value) + { + word0 = (~(((1U << 6) - 1)<<16) & word0) | ((((1U << 6) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_write_m1() const + { + auto v = ((1U << 5) - 1) & (word0 >> 24); + return v; + } + CONSTEXPR axi_limit2_r& set_max_outstanding_write_m1(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<24) & word0) | ((((1U << 5) - 1) & value) << 24); + return *this; + } +#endif +}; + + +struct axi_limit3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t max_beats : 2; + uint32_t reserved0 : 2; + uint32_t memtype : 4; + uint32_t reserved1 : 8; + uint32_t max_outstanding_read_m1 : 6; + uint32_t reserved2 : 2; + uint32_t max_outstanding_write_m1 : 5; + uint32_t reserved3 : 3; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR axi_limit3_r() : + word0(0) + {} + CONSTEXPR axi_limit3_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + axi_limit3_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::max_beats get_max_beats() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + assert(v <= 2); + return static_cast(v); + } + CONSTEXPR axi_limit3_r& set_max_beats(NPU_NAMESPACE::max_beats value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::axi_mem_encoding get_memtype() const + { + auto v = ((1U << 4) - 1) & (word0 >> 4); + assert(v <= 11); + return static_cast(v); + } + CONSTEXPR axi_limit3_r& set_memtype(NPU_NAMESPACE::axi_mem_encoding value) + { + word0 = (~(((1U << 4) - 1)<<4) & word0) | ((((1U << 4) - 1) & static_cast(value)) 
<< 4); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_read_m1() const + { + auto v = ((1U << 6) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR axi_limit3_r& set_max_outstanding_read_m1(uint32_t value) + { + word0 = (~(((1U << 6) - 1)<<16) & word0) | ((((1U << 6) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_write_m1() const + { + auto v = ((1U << 5) - 1) & (word0 >> 24); + return v; + } + CONSTEXPR axi_limit3_r& set_max_outstanding_write_m1(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<24) & word0) | ((((1U << 5) - 1) & value) << 24); + return *this; + } +#endif +}; + + +struct basep_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR basep_r() : + word0(0), + word1(0) + {} + CONSTEXPR basep_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + basep_r copy() + { + return *this; + } +#endif +}; + + +struct wd_status_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t core_slice_state : 2; + uint32_t core_idle : 1; + uint32_t ctrl_state : 2; + uint32_t ctrl_idle : 1; + uint32_t write_buf_index0 : 3; + uint32_t write_buf_valid0 : 1; + uint32_t write_buf_idle0 : 1; + uint32_t write_buf_index1 : 3; + uint32_t write_buf_valid1 : 1; + uint32_t write_buf_idle1 : 1; + uint32_t events : 12; + uint32_t reserved0 : 4; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR wd_status_r() : + word0(0) + 
{} + CONSTEXPR wd_status_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + wd_status_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::wd_core_slice_state get_core_slice_state() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + assert(v <= 2); + return static_cast(v); + } + CONSTEXPR wd_status_r& set_core_slice_state(NPU_NAMESPACE::wd_core_slice_state value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR uint32_t get_core_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR wd_status_r& set_core_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR NPU_NAMESPACE::wd_ctrl_state get_ctrl_state() const + { + auto v = ((1U << 2) - 1) & (word0 >> 3); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR wd_status_r& set_ctrl_state(NPU_NAMESPACE::wd_ctrl_state value) + { + word0 = (~(((1U << 2) - 1)<<3) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 3); + return *this; + } + CONSTEXPR uint32_t get_ctrl_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR wd_status_r& set_ctrl_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_write_buf_index0() const + { + auto v = ((1U << 3) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR wd_status_r& set_write_buf_index0(uint32_t value) + { + word0 = (~(((1U << 3) - 1)<<6) & word0) | ((((1U << 3) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_write_buf_valid0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 9); + return v; + } + CONSTEXPR wd_status_r& set_write_buf_valid0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<9) & word0) | ((((1U << 1) - 1) & 
value) << 9); + return *this; + } + CONSTEXPR uint32_t get_write_buf_idle0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR wd_status_r& set_write_buf_idle0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } + CONSTEXPR uint32_t get_write_buf_index1() const + { + auto v = ((1U << 3) - 1) & (word0 >> 11); + return v; + } + CONSTEXPR wd_status_r& set_write_buf_index1(uint32_t value) + { + word0 = (~(((1U << 3) - 1)<<11) & word0) | ((((1U << 3) - 1) & value) << 11); + return *this; + } + CONSTEXPR uint32_t get_write_buf_valid1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 14); + return v; + } + CONSTEXPR wd_status_r& set_write_buf_valid1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<14) & word0) | ((((1U << 1) - 1) & value) << 14); + return *this; + } + CONSTEXPR uint32_t get_write_buf_idle1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 15); + return v; + } + CONSTEXPR wd_status_r& set_write_buf_idle1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<15) & word0) | ((((1U << 1) - 1) & value) << 15); + return *this; + } + CONSTEXPR uint32_t get_events() const + { + auto v = ((1U << 12) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR wd_status_r& set_events(uint32_t value) + { + word0 = (~(((1U << 12) - 1)<<16) & word0) | ((((1U << 12) - 1) & value) << 16); + return *this; + } +#endif +}; + + +struct mac_status_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t block_cfg_valid : 1; + uint32_t trav_en : 1; + uint32_t wait_for_ib : 1; + uint32_t wait_for_acc_buf : 1; + uint32_t wait_for_weights : 1; + uint32_t stall_stripe : 1; + uint32_t dw_sel : 1; + uint32_t wait_for_dw0_ready : 1; + uint32_t wait_for_dw1_ready : 1; + uint32_t acc_buf_sel_ai : 1; + uint32_t wait_for_acc0_ready : 1; + uint32_t wait_for_acc1_ready : 1; + uint32_t acc_buf_sel_aa : 1; + uint32_t acc0_valid : 1; + uint32_t acc1_valid : 1; + uint32_t reserved0 : 1; + uint32_t 
events : 11; + uint32_t reserved1 : 5; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR mac_status_r() : + word0(0) + {} + CONSTEXPR mac_status_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + mac_status_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_block_cfg_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR mac_status_r& set_block_cfg_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_trav_en() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR mac_status_r& set_trav_en(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_wait_for_ib() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_ib(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_wait_for_acc_buf() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_acc_buf(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_wait_for_weights() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_weights(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_stall_stripe() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR mac_status_r& set_stall_stripe(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + 
return *this; + } + CONSTEXPR uint32_t get_dw_sel() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR mac_status_r& set_dw_sel(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_wait_for_dw0_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_dw0_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_wait_for_dw1_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_dw1_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_acc_buf_sel_ai() const + { + auto v = ((1U << 1) - 1) & (word0 >> 9); + return v; + } + CONSTEXPR mac_status_r& set_acc_buf_sel_ai(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<9) & word0) | ((((1U << 1) - 1) & value) << 9); + return *this; + } + CONSTEXPR uint32_t get_wait_for_acc0_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_acc0_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } + CONSTEXPR uint32_t get_wait_for_acc1_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 11); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_acc1_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<11) & word0) | ((((1U << 1) - 1) & value) << 11); + return *this; + } + CONSTEXPR uint32_t get_acc_buf_sel_aa() const + { + auto v = ((1U << 1) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR mac_status_r& set_acc_buf_sel_aa(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<12) & word0) | ((((1U << 1) - 1) & value) << 12); + return *this; + } + CONSTEXPR uint32_t 
get_acc0_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 13); + return v; + } + CONSTEXPR mac_status_r& set_acc0_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<13) & word0) | ((((1U << 1) - 1) & value) << 13); + return *this; + } + CONSTEXPR uint32_t get_acc1_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 14); + return v; + } + CONSTEXPR mac_status_r& set_acc1_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<14) & word0) | ((((1U << 1) - 1) & value) << 14); + return *this; + } + CONSTEXPR uint32_t get_events() const + { + auto v = ((1U << 11) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR mac_status_r& set_events(uint32_t value) + { + word0 = (~(((1U << 11) - 1)<<16) & word0) | ((((1U << 11) - 1) & value) << 16); + return *this; + } +#endif +}; + + +struct ao_status_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t cmd_sbw_valid : 1; + uint32_t cmd_act_valid : 1; + uint32_t cmd_ctl_valid : 1; + uint32_t cmd_scl_valid : 1; + uint32_t cmd_sbr_valid : 1; + uint32_t cmd_ofm_valid : 1; + uint32_t blk_cmd_ready : 1; + uint32_t blk_cmd_valid : 1; + uint32_t reserved0 : 8; + uint32_t events : 8; + uint32_t reserved1 : 8; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ao_status_r() : + word0(0) + {} + CONSTEXPR ao_status_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ao_status_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_cmd_sbw_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR ao_status_r& set_cmd_sbw_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_cmd_act_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR ao_status_r& set_cmd_act_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 
1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_cmd_ctl_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR ao_status_r& set_cmd_ctl_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_cmd_scl_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR ao_status_r& set_cmd_scl_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_cmd_sbr_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR ao_status_r& set_cmd_sbr_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_cmd_ofm_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR ao_status_r& set_cmd_ofm_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_blk_cmd_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR ao_status_r& set_blk_cmd_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_blk_cmd_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR ao_status_r& set_blk_cmd_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_events() const + { + auto v = ((1U << 8) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR ao_status_r& set_events(uint32_t value) + { + word0 = (~(((1U << 8) - 1)<<16) & word0) | ((((1U << 8) - 1) & value) << 16); + return *this; + } +#endif +}; + + +struct dma_status0_r +{ 
+#ifndef __cplusplus + union + { + struct + { + uint32_t cmd_idle : 1; + uint32_t ifm_idle : 1; + uint32_t wgt_idle_c0 : 1; + uint32_t bas_idle_c0 : 1; + uint32_t m2m_idle : 1; + uint32_t ofm_idle : 1; + uint32_t halt_req : 1; + uint32_t halt_ack : 1; + uint32_t pause_req : 1; + uint32_t pause_ack : 1; + uint32_t ib0_ai_valid_c0 : 1; + uint32_t ib0_ai_ready_c0 : 1; + uint32_t ib1_ai_valid_c0 : 1; + uint32_t ib1_ai_ready_c0 : 1; + uint32_t ib0_ao_valid_c0 : 1; + uint32_t ib0_ao_ready_c0 : 1; + uint32_t ib1_ao_valid_c0 : 1; + uint32_t ib1_ao_ready_c0 : 1; + uint32_t ob0_valid_c0 : 1; + uint32_t ob0_ready_c0 : 1; + uint32_t ob1_valid_c0 : 1; + uint32_t ob1_ready_c0 : 1; + uint32_t cmd_valid : 1; + uint32_t cmd_ready : 1; + uint32_t wd_bitstream_valid_c0 : 1; + uint32_t wd_bitstream_ready_c0 : 1; + uint32_t bs_bitstream_valid_c0 : 1; + uint32_t bs_bitstream_ready_c0 : 1; + uint32_t axi0_ar_stalled : 1; + uint32_t axi0_rd_limit_stall : 1; + uint32_t axi0_aw_stalled : 1; + uint32_t axi0_w_stalled : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma_status0_r() : + word0(0) + {} + CONSTEXPR dma_status0_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma_status0_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_cmd_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR dma_status0_r& set_cmd_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_ifm_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR dma_status0_r& set_ifm_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_wgt_idle_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + 
CONSTEXPR dma_status0_r& set_wgt_idle_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_bas_idle_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR dma_status0_r& set_bas_idle_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_m2m_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR dma_status0_r& set_m2m_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_ofm_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR dma_status0_r& set_ofm_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_halt_req() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR dma_status0_r& set_halt_req(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_halt_ack() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR dma_status0_r& set_halt_ack(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_pause_req() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR dma_status0_r& set_pause_req(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_pause_ack() const + { + auto v = ((1U << 1) - 1) & (word0 >> 9); + return v; + } + CONSTEXPR dma_status0_r& set_pause_ack(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<9) & word0) | ((((1U << 1) - 1) & value) << 9); + return 
*this; + } + CONSTEXPR uint32_t get_ib0_ai_valid_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR dma_status0_r& set_ib0_ai_valid_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } + CONSTEXPR uint32_t get_ib0_ai_ready_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 11); + return v; + } + CONSTEXPR dma_status0_r& set_ib0_ai_ready_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<11) & word0) | ((((1U << 1) - 1) & value) << 11); + return *this; + } + CONSTEXPR uint32_t get_ib1_ai_valid_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR dma_status0_r& set_ib1_ai_valid_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<12) & word0) | ((((1U << 1) - 1) & value) << 12); + return *this; + } + CONSTEXPR uint32_t get_ib1_ai_ready_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 13); + return v; + } + CONSTEXPR dma_status0_r& set_ib1_ai_ready_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<13) & word0) | ((((1U << 1) - 1) & value) << 13); + return *this; + } + CONSTEXPR uint32_t get_ib0_ao_valid_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 14); + return v; + } + CONSTEXPR dma_status0_r& set_ib0_ao_valid_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<14) & word0) | ((((1U << 1) - 1) & value) << 14); + return *this; + } + CONSTEXPR uint32_t get_ib0_ao_ready_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 15); + return v; + } + CONSTEXPR dma_status0_r& set_ib0_ao_ready_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<15) & word0) | ((((1U << 1) - 1) & value) << 15); + return *this; + } + CONSTEXPR uint32_t get_ib1_ao_valid_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR dma_status0_r& set_ib1_ao_valid_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<16) & word0) | ((((1U << 1) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t 
get_ib1_ao_ready_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 17); + return v; + } + CONSTEXPR dma_status0_r& set_ib1_ao_ready_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<17) & word0) | ((((1U << 1) - 1) & value) << 17); + return *this; + } + CONSTEXPR uint32_t get_ob0_valid_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 18); + return v; + } + CONSTEXPR dma_status0_r& set_ob0_valid_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<18) & word0) | ((((1U << 1) - 1) & value) << 18); + return *this; + } + CONSTEXPR uint32_t get_ob0_ready_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 19); + return v; + } + CONSTEXPR dma_status0_r& set_ob0_ready_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<19) & word0) | ((((1U << 1) - 1) & value) << 19); + return *this; + } + CONSTEXPR uint32_t get_ob1_valid_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 20); + return v; + } + CONSTEXPR dma_status0_r& set_ob1_valid_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<20) & word0) | ((((1U << 1) - 1) & value) << 20); + return *this; + } + CONSTEXPR uint32_t get_ob1_ready_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 21); + return v; + } + CONSTEXPR dma_status0_r& set_ob1_ready_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<21) & word0) | ((((1U << 1) - 1) & value) << 21); + return *this; + } + CONSTEXPR uint32_t get_cmd_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 22); + return v; + } + CONSTEXPR dma_status0_r& set_cmd_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<22) & word0) | ((((1U << 1) - 1) & value) << 22); + return *this; + } + CONSTEXPR uint32_t get_cmd_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 23); + return v; + } + CONSTEXPR dma_status0_r& set_cmd_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<23) & word0) | ((((1U << 1) - 1) & value) << 23); + return *this; + } + CONSTEXPR uint32_t get_wd_bitstream_valid_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 24); + 
return v; + } + CONSTEXPR dma_status0_r& set_wd_bitstream_valid_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<24) & word0) | ((((1U << 1) - 1) & value) << 24); + return *this; + } + CONSTEXPR uint32_t get_wd_bitstream_ready_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 25); + return v; + } + CONSTEXPR dma_status0_r& set_wd_bitstream_ready_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<25) & word0) | ((((1U << 1) - 1) & value) << 25); + return *this; + } + CONSTEXPR uint32_t get_bs_bitstream_valid_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 26); + return v; + } + CONSTEXPR dma_status0_r& set_bs_bitstream_valid_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<26) & word0) | ((((1U << 1) - 1) & value) << 26); + return *this; + } + CONSTEXPR uint32_t get_bs_bitstream_ready_c0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 27); + return v; + } + CONSTEXPR dma_status0_r& set_bs_bitstream_ready_c0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<27) & word0) | ((((1U << 1) - 1) & value) << 27); + return *this; + } + CONSTEXPR uint32_t get_axi0_ar_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 28); + return v; + } + CONSTEXPR dma_status0_r& set_axi0_ar_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<28) & word0) | ((((1U << 1) - 1) & value) << 28); + return *this; + } + CONSTEXPR uint32_t get_axi0_rd_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 29); + return v; + } + CONSTEXPR dma_status0_r& set_axi0_rd_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<29) & word0) | ((((1U << 1) - 1) & value) << 29); + return *this; + } + CONSTEXPR uint32_t get_axi0_aw_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 30); + return v; + } + CONSTEXPR dma_status0_r& set_axi0_aw_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<30) & word0) | ((((1U << 1) - 1) & value) << 30); + return *this; + } + CONSTEXPR uint32_t get_axi0_w_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 
>> 31); + return v; + } + CONSTEXPR dma_status0_r& set_axi0_w_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct dma_status1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t axi0_wr_limit_stall : 1; + uint32_t axi1_ar_stalled : 1; + uint32_t axi1_rd_limit_stall : 1; + uint32_t axi1_wr_stalled : 1; + uint32_t axi1_w_stalled : 1; + uint32_t axi1_wr_limit_stall : 1; + uint32_t wgt_idle_c1 : 1; + uint32_t bas_idle_c1 : 1; + uint32_t ib0_ai_valid_c1 : 1; + uint32_t ib0_ai_ready_c1 : 1; + uint32_t ib1_ai_valid_c1 : 1; + uint32_t ib1_ai_ready_c1 : 1; + uint32_t ib0_ao_valid_c1 : 1; + uint32_t ib0_ao_ready_c1 : 1; + uint32_t ib1_ao_valid_c1 : 1; + uint32_t ib1_ao_ready_c1 : 1; + uint32_t ob0_valid_c1 : 1; + uint32_t ob0_ready_c1 : 1; + uint32_t ob1_valid_c1 : 1; + uint32_t ob1_ready_c1 : 1; + uint32_t wd_bitstream_valid_c1 : 1; + uint32_t wd_bitstream_ready_c1 : 1; + uint32_t bs_bitstream_valid_c1 : 1; + uint32_t bs_bitstream_ready_c1 : 1; + uint32_t reserved0 : 8; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma_status1_r() : + word0(0) + {} + CONSTEXPR dma_status1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma_status1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_axi0_wr_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR dma_status1_r& set_axi0_wr_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_axi1_ar_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR dma_status1_r& set_axi1_ar_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + 
CONSTEXPR uint32_t get_axi1_rd_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR dma_status1_r& set_axi1_rd_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_axi1_wr_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR dma_status1_r& set_axi1_wr_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_axi1_w_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR dma_status1_r& set_axi1_w_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_axi1_wr_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR dma_status1_r& set_axi1_wr_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_wgt_idle_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR dma_status1_r& set_wgt_idle_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_bas_idle_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR dma_status1_r& set_bas_idle_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_ib0_ai_valid_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR dma_status1_r& set_ib0_ai_valid_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_ib0_ai_ready_c1() const + { + auto v = 
((1U << 1) - 1) & (word0 >> 9); + return v; + } + CONSTEXPR dma_status1_r& set_ib0_ai_ready_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<9) & word0) | ((((1U << 1) - 1) & value) << 9); + return *this; + } + CONSTEXPR uint32_t get_ib1_ai_valid_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR dma_status1_r& set_ib1_ai_valid_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } + CONSTEXPR uint32_t get_ib1_ai_ready_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 11); + return v; + } + CONSTEXPR dma_status1_r& set_ib1_ai_ready_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<11) & word0) | ((((1U << 1) - 1) & value) << 11); + return *this; + } + CONSTEXPR uint32_t get_ib0_ao_valid_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR dma_status1_r& set_ib0_ao_valid_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<12) & word0) | ((((1U << 1) - 1) & value) << 12); + return *this; + } + CONSTEXPR uint32_t get_ib0_ao_ready_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 13); + return v; + } + CONSTEXPR dma_status1_r& set_ib0_ao_ready_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<13) & word0) | ((((1U << 1) - 1) & value) << 13); + return *this; + } + CONSTEXPR uint32_t get_ib1_ao_valid_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 14); + return v; + } + CONSTEXPR dma_status1_r& set_ib1_ao_valid_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<14) & word0) | ((((1U << 1) - 1) & value) << 14); + return *this; + } + CONSTEXPR uint32_t get_ib1_ao_ready_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 15); + return v; + } + CONSTEXPR dma_status1_r& set_ib1_ao_ready_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<15) & word0) | ((((1U << 1) - 1) & value) << 15); + return *this; + } + CONSTEXPR uint32_t get_ob0_valid_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 16); + return v; 
+ } + CONSTEXPR dma_status1_r& set_ob0_valid_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<16) & word0) | ((((1U << 1) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_ob0_ready_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 17); + return v; + } + CONSTEXPR dma_status1_r& set_ob0_ready_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<17) & word0) | ((((1U << 1) - 1) & value) << 17); + return *this; + } + CONSTEXPR uint32_t get_ob1_valid_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 18); + return v; + } + CONSTEXPR dma_status1_r& set_ob1_valid_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<18) & word0) | ((((1U << 1) - 1) & value) << 18); + return *this; + } + CONSTEXPR uint32_t get_ob1_ready_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 19); + return v; + } + CONSTEXPR dma_status1_r& set_ob1_ready_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<19) & word0) | ((((1U << 1) - 1) & value) << 19); + return *this; + } + CONSTEXPR uint32_t get_wd_bitstream_valid_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 20); + return v; + } + CONSTEXPR dma_status1_r& set_wd_bitstream_valid_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<20) & word0) | ((((1U << 1) - 1) & value) << 20); + return *this; + } + CONSTEXPR uint32_t get_wd_bitstream_ready_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 21); + return v; + } + CONSTEXPR dma_status1_r& set_wd_bitstream_ready_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<21) & word0) | ((((1U << 1) - 1) & value) << 21); + return *this; + } + CONSTEXPR uint32_t get_bs_bitstream_valid_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 22); + return v; + } + CONSTEXPR dma_status1_r& set_bs_bitstream_valid_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<22) & word0) | ((((1U << 1) - 1) & value) << 22); + return *this; + } + CONSTEXPR uint32_t get_bs_bitstream_ready_c1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 23); + return v; + } + CONSTEXPR 
dma_status1_r& set_bs_bitstream_ready_c1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<23) & word0) | ((((1U << 1) - 1) & value) << 23); + return *this; + } +#endif +}; + + +struct clkforce_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t top_level_clk : 1; + uint32_t cc_clk : 1; + uint32_t dma_clk : 1; + uint32_t mac_clk : 1; + uint32_t ao_clk : 1; + uint32_t wd_clk : 1; + uint32_t reserved0 : 26; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR clkforce_r() : + word0(0) + {} + CONSTEXPR clkforce_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + clkforce_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_top_level_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR clkforce_r& set_top_level_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_cc_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR clkforce_r& set_cc_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_dma_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR clkforce_r& set_dma_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_mac_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR clkforce_r& set_mac_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_ao_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR clkforce_r& set_ao_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & 
word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_wd_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR clkforce_r& set_wd_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } +#endif +}; + + +struct debug_address_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t addr : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR debug_address_r() : + word0(0) + {} + CONSTEXPR debug_address_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + debug_address_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_addr() const + { + auto v = word0; + return v; + } + CONSTEXPR debug_address_r& set_addr(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct debug_misc_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t misc : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR debug_misc_r() : + word0(0) + {} + CONSTEXPR debug_misc_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + debug_misc_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_misc() const + { + auto v = word0; + return v; + } + CONSTEXPR debug_misc_r& set_misc(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct debugcore_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t core : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR debugcore_r() : + word0(0) + {} + CONSTEXPR debugcore_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + debugcore_r 
copy() + { + return *this; + } + CONSTEXPR uint32_t get_core() const + { + auto v = word0; + return v; + } + CONSTEXPR debugcore_r& set_core(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct debug_block_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t block : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR debug_block_r() : + word0(0) + {} + CONSTEXPR debug_block_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + debug_block_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_block() const + { + auto v = word0; + return v; + } + CONSTEXPR debug_block_r& set_block(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pmcr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t cnt_en : 1; + uint32_t event_cnt_rst : 1; + uint32_t cycle_cnt_rst : 1; + uint32_t mask_en : 1; + uint32_t reserved0 : 7; + uint32_t num_event_cnt : 5; + uint32_t reserved1 : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmcr_r() : + word0(8192) + {} + CONSTEXPR pmcr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmcr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_cnt_en() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmcr_r& set_cnt_en(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_event_cnt_rst() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmcr_r& set_event_cnt_rst(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_cycle_cnt_rst() const + { 
+ auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmcr_r& set_cycle_cnt_rst(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_mask_en() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmcr_r& set_mask_en(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_num_event_cnt() const + { + auto v = ((1U << 5) - 1) & (word0 >> 11); + return v; + } + CONSTEXPR pmcr_r& set_num_event_cnt(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<11) & word0) | ((((1U << 5) - 1) & value) << 11); + return *this; + } +#endif +}; + + +struct pmcntenset_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0 : 1; + uint32_t EVENT_CNT_1 : 1; + uint32_t EVENT_CNT_2 : 1; + uint32_t EVENT_CNT_3 : 1; + uint32_t reserved0 : 27; + uint32_t CYCLE_CNT : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmcntenset_r() : + word0(0) + {} + CONSTEXPR pmcntenset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmcntenset_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR 
pmcntenset_r& set_EVENT_CNT_2(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_3(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmcntenset_r& set_CYCLE_CNT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmcntenclr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0 : 1; + uint32_t EVENT_CNT_1 : 1; + uint32_t EVENT_CNT_2 : 1; + uint32_t EVENT_CNT_3 : 1; + uint32_t reserved0 : 27; + uint32_t CYCLE_CNT : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmcntenclr_r() : + word0(0) + {} + CONSTEXPR pmcntenclr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmcntenclr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_2(uint32_t value) + { + word0 = 
(~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_3(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmcntenclr_r& set_CYCLE_CNT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmovsset_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0_OVF : 1; + uint32_t EVENT_CNT_1_OVF : 1; + uint32_t EVENT_CNT_2_OVF : 1; + uint32_t EVENT_CNT_3_OVF : 1; + uint32_t reserved0 : 27; + uint32_t CYCLE_CNT_OVF : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmovsset_r() : + word0(0) + {} + CONSTEXPR pmovsset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmovsset_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_0_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_1_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_2_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) 
| ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_3_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmovsset_r& set_CYCLE_CNT_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmovsclr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0_OVF : 1; + uint32_t EVENT_CNT_1_OVF : 1; + uint32_t EVENT_CNT_2_OVF : 1; + uint32_t EVENT_CNT_3_OVF : 1; + uint32_t reserved0 : 27; + uint32_t CYCLE_CNT_OVF : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmovsclr_r() : + word0(0) + {} + CONSTEXPR pmovsclr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmovsclr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_0_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_1_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_2_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) 
& value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_3_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmovsclr_r& set_CYCLE_CNT_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmintset_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0_INT : 1; + uint32_t EVENT_CNT_1_INT : 1; + uint32_t EVENT_CNT_2_INT : 1; + uint32_t EVENT_CNT_3_INT : 1; + uint32_t reserved0 : 27; + uint32_t CYCLE_CNT_INT : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmintset_r() : + word0(0) + {} + CONSTEXPR pmintset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmintset_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_0_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_1_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_2_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + 
return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_3_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmintset_r& set_CYCLE_CNT_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmintclr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0_INT : 1; + uint32_t EVENT_CNT_1_INT : 1; + uint32_t EVENT_CNT_2_INT : 1; + uint32_t EVENT_CNT_3_INT : 1; + uint32_t reserved0 : 27; + uint32_t CYCLE_CNT_INT : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmintclr_r() : + word0(0) + {} + CONSTEXPR pmintclr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmintclr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_0_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_1_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_2_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + 
CONSTEXPR uint32_t get_EVENT_CNT_3_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_3_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmintclr_r& set_CYCLE_CNT_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmccntr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CYCLE_CNT_LO : 32; + uint32_t CYCLE_CNT_HI : 16; + uint32_t reserved0 : 16; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR pmccntr_r() : + word0(0), + word1(0) + {} + CONSTEXPR pmccntr_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + pmccntr_r copy() + { + return *this; + } +#endif +}; + + +struct pmccntr_cfg_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CYCLE_CNT_CFG_START : 10; + uint32_t reserved0 : 6; + uint32_t CYCLE_CNT_CFG_STOP : 10; + uint32_t reserved1 : 6; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmccntr_cfg_r() : + word0(0) + {} + CONSTEXPR pmccntr_cfg_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmccntr_cfg_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_event 
get_CYCLE_CNT_CFG_START() const + { + auto v = ((1U << 10) - 1) & (word0 >> 0); + assert(v <= 433); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_CYCLE_CNT_CFG_START(NPU_NAMESPACE::pmu_event value) + { + word0 = (~(((1U << 10) - 1)<<0) & word0) | ((((1U << 10) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_event get_CYCLE_CNT_CFG_STOP() const + { + auto v = ((1U << 10) - 1) & (word0 >> 16); + assert(v <= 433); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_CYCLE_CNT_CFG_STOP(NPU_NAMESPACE::pmu_event value) + { + word0 = (~(((1U << 10) - 1)<<16) & word0) | ((((1U << 10) - 1) & static_cast(value)) << 16); + return *this; + } +#endif +}; + + +struct pmcaxi_chan_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CH_SEL : 4; + uint32_t reserved0 : 4; + uint32_t AXI_CNT_SEL : 2; + uint32_t BW_CH_SEL_EN : 1; + uint32_t reserved1 : 21; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmcaxi_chan_r() : + word0(0) + {} + CONSTEXPR pmcaxi_chan_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmcaxi_chan_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_axi_channel get_CH_SEL() const + { + auto v = ((1U << 4) - 1) & (word0 >> 0); + assert(v <= 9); + return static_cast(v); + } + CONSTEXPR pmcaxi_chan_r& set_CH_SEL(NPU_NAMESPACE::pmu_axi_channel value) + { + word0 = (~(((1U << 4) - 1)<<0) & word0) | ((((1U << 4) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR uint32_t get_AXI_CNT_SEL() const + { + auto v = ((1U << 2) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR pmcaxi_chan_r& set_AXI_CNT_SEL(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<8) & word0) | ((((1U << 2) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_BW_CH_SEL_EN() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + 
CONSTEXPR pmcaxi_chan_r& set_BW_CH_SEL_EN(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } +#endif +}; + + +struct kernel_x_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_x_r() : + word0(0) + {} + CONSTEXPR kernel_x_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_x_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR kernel_x_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct kernel_y_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_y_r() : + word0(0) + {} + CONSTEXPR kernel_y_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_y_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR kernel_y_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct kernel_w_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_w_m1_r() : + word0(0) + {} + CONSTEXPR kernel_w_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_w_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR kernel_w_m1_r& set_value(uint32_t value) + { + word0 = 
value; + return *this; + } +#endif +}; + + +struct kernel_h_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_h_m1_r() : + word0(0) + {} + CONSTEXPR kernel_h_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_h_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR kernel_h_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_cblk_width_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_cblk_width_m1_r() : + word0(0) + {} + CONSTEXPR ofm_cblk_width_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_cblk_width_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_cblk_width_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_cblk_height_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_cblk_height_m1_r() : + word0(0) + {} + CONSTEXPR ofm_cblk_height_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_cblk_height_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_cblk_height_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + 
+ +struct ofm_cblk_depth_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_cblk_depth_m1_r() : + word0(0) + {} + CONSTEXPR ofm_cblk_depth_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_cblk_depth_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_cblk_depth_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_cblk_depth_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_cblk_depth_m1_r() : + word0(0) + {} + CONSTEXPR ifm_cblk_depth_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_cblk_depth_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_cblk_depth_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_x_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_x_r() : + word0(0) + {} + CONSTEXPR ofm_x_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_x_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_x_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_y_r +{ +#ifndef __cplusplus + union + { + struct + { + 
uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_y_r() : + word0(0) + {} + CONSTEXPR ofm_y_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_y_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_y_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_z_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_z_r() : + word0(0) + {} + CONSTEXPR ofm_z_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_z_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_z_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_z_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_z_r() : + word0(0) + {} + CONSTEXPR ifm_z_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_z_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_z_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pad_top_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pad_top_r() : + word0(0) + {} + CONSTEXPR pad_top_r(uint32_t init) : + word0(init) + {} + 
CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pad_top_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR pad_top_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pad_left_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pad_left_r() : + word0(0) + {} + CONSTEXPR pad_left_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pad_left_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR pad_left_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_cblk_width_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_cblk_width_r() : + word0(0) + {} + CONSTEXPR ifm_cblk_width_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_cblk_width_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_cblk_width_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_cblk_height_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_cblk_height_r() : + word0(0) + {} + CONSTEXPR ifm_cblk_height_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { 
+ return word0; + } + ifm_cblk_height_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_cblk_height_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_ifm_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_ifm_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_ifm_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_ifm_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_ifm_dst_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma_ifm_dst_r() : + word0(0) + {} + CONSTEXPR dma_ifm_dst_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma_ifm_dst_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma_ifm_dst_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_ofm_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma_ofm_src_r() : + word0(0) + {} + CONSTEXPR dma_ofm_src_r(uint32_t init) : + 
word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma_ofm_src_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma_ofm_src_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_ofm_dst_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_ofm_dst_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_ofm_dst_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_ofm_dst_r copy() + { + return *this; + } +#endif +}; + + +struct dma_weight_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_weight_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_weight_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) 
<< 32) | word0; + } + dma_weight_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_cmd_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_cmd_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_cmd_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_cmd_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_cmd_size_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma_cmd_size_r() : + word0(0) + {} + CONSTEXPR dma_cmd_size_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma_cmd_size_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma_cmd_size_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_m2m_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_m2m_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_m2m_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + 
word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_m2m_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_m2m_dst_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_m2m_dst_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_m2m_dst_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_m2m_dst_r copy() + { + return *this; + } +#endif +}; + + +struct current_qread_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR current_qread_r() : + word0(0) + {} + CONSTEXPR current_qread_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + current_qread_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR current_qread_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_scale_src_r +{ +#ifndef __cplusplus + union + { + struct + { + 
uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_scale_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_scale_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_scale_src_r copy() + { + return *this; + } +#endif +}; + + +struct current_block_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR current_block_r() : + word0(0) + {} + CONSTEXPR current_block_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + current_block_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR current_block_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct current_op_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR current_op_r() : + word0(0) + {} + CONSTEXPR current_op_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + current_op_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR current_op_r& set_value(uint32_t value) + { + 
word0 = value; + return *this; + } +#endif +}; + + +struct current_cmd_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR current_cmd_r() : + word0(0) + {} + CONSTEXPR current_cmd_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + current_cmd_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR current_cmd_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pmevcntr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t count : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmevcntr_r() : + word0(0) + {} + CONSTEXPR pmevcntr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmevcntr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_count() const + { + auto v = word0; + return v; + } + CONSTEXPR pmevcntr_r& set_count(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pmevtyper_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EV_TYPE : 10; + uint32_t reserved0 : 22; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmevtyper_r() : + word0(0) + {} + CONSTEXPR pmevtyper_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmevtyper_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_event get_EV_TYPE() const + { + auto v = ((1U << 10) - 1) & (word0 >> 0); + assert(v <= 433); + return static_cast(v); + } + CONSTEXPR pmevtyper_r& set_EV_TYPE(NPU_NAMESPACE::pmu_event value) + { + word0 = 
(~(((1U << 10) - 1)<<0) & word0) | ((((1U << 10) - 1) & static_cast(value)) << 0); + return *this; + } +#endif +}; + + +struct shared_buffer_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t mem_word : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR shared_buffer_r() : + word0(0) + {} + CONSTEXPR shared_buffer_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + shared_buffer_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_mem_word() const + { + auto v = word0; + return v; + } + CONSTEXPR shared_buffer_r& set_mem_word(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_pad_top_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_pad_top_r() : + word0(0) + {} + CONSTEXPR ifm_pad_top_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_pad_top_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_pad_top_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_pad_left_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_pad_left_r() : + word0(0) + {} + CONSTEXPR ifm_pad_left_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_pad_left_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_pad_left_r& set_value(uint32_t value) + { + word0 = value; 
+ return *this; + } +#endif +}; + + +struct ifm_pad_right_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_pad_right_r() : + word0(0) + {} + CONSTEXPR ifm_pad_right_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_pad_right_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_pad_right_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_pad_bottom_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_pad_bottom_r() : + word0(0) + {} + CONSTEXPR ifm_pad_bottom_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_pad_bottom_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_pad_bottom_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_depth_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_depth_m1_r() : + word0(0) + {} + CONSTEXPR ifm_depth_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_depth_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_depth_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_precision_r +{ +#ifndef 
__cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_precision_r() : + word0(0) + {} + CONSTEXPR ifm_precision_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_precision_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_precision_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_upscale_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_upscale_r() : + word0(0) + {} + CONSTEXPR ifm_upscale_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_upscale_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_upscale_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_zero_point_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_zero_point_r() : + word0(0) + {} + CONSTEXPR ifm_zero_point_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_zero_point_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_zero_point_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_width0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; 
+ }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_width0_m1_r() : + word0(0) + {} + CONSTEXPR ifm_width0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_width0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_width0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_height0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_height0_m1_r() : + word0(0) + {} + CONSTEXPR ifm_height0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_height0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_height0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_height1_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_height1_m1_r() : + word0(0) + {} + CONSTEXPR ifm_height1_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_height1_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_height1_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_ib_end_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR 
ifm_ib_end_r() : + word0(0) + {} + CONSTEXPR ifm_ib_end_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_ib_end_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_ib_end_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_region_r() : + word0(0) + {} + CONSTEXPR ifm_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_width_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_width_m1_r() : + word0(0) + {} + CONSTEXPR ofm_width_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_width_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_width_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_height_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_height_m1_r() : + word0(0) + {} + CONSTEXPR ofm_height_m1_r(uint32_t init) : + word0(init) + {} + 
CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_height_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_height_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_depth_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_depth_m1_r() : + word0(0) + {} + CONSTEXPR ofm_depth_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_depth_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_depth_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_precision_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_precision_r() : + word0(0) + {} + CONSTEXPR ofm_precision_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_precision_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_precision_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_blk_width_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_blk_width_m1_r() : + word0(0) + {} + CONSTEXPR ofm_blk_width_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + 
CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_blk_width_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_blk_width_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_blk_height_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_blk_height_m1_r() : + word0(0) + {} + CONSTEXPR ofm_blk_height_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_blk_height_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_blk_height_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_blk_depth_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_blk_depth_m1_r() : + word0(0) + {} + CONSTEXPR ofm_blk_depth_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_blk_depth_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_blk_depth_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_zero_point_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_zero_point_r() : + word0(0) + {} + CONSTEXPR ofm_zero_point_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + 
{ + return word0; + } + ofm_zero_point_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_zero_point_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_width0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_width0_m1_r() : + word0(0) + {} + CONSTEXPR ofm_width0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_width0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_width0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_height0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_height0_m1_r() : + word0(0) + {} + CONSTEXPR ofm_height0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_height0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_height0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_height1_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_height1_m1_r() : + word0(0) + {} + CONSTEXPR ofm_height1_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_height1_m1_r copy() + { + return 
*this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_height1_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_region_r() : + word0(0) + {} + CONSTEXPR ofm_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct kernel_width_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_width_m1_r() : + word0(0) + {} + CONSTEXPR kernel_width_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_width_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR kernel_width_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct kernel_height_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_height_m1_r() : + word0(0) + {} + CONSTEXPR kernel_height_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_height_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = 
word0; + return v; + } + CONSTEXPR kernel_height_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct kernel_stride_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_stride_r() : + word0(0) + {} + CONSTEXPR kernel_stride_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_stride_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR kernel_stride_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct parallel_mode_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR parallel_mode_r() : + word0(0) + {} + CONSTEXPR parallel_mode_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + parallel_mode_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR parallel_mode_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct acc_format_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR acc_format_r() : + word0(0) + {} + CONSTEXPR acc_format_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + acc_format_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR acc_format_r& set_value(uint32_t value) + { + 
word0 = value; + return *this; + } +#endif +}; + + +struct activation_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR activation_r() : + word0(0) + {} + CONSTEXPR activation_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + activation_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR activation_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct activation_min_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR activation_min_r() : + word0(0) + {} + CONSTEXPR activation_min_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + activation_min_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR activation_min_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct activation_max_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR activation_max_r() : + word0(0) + {} + CONSTEXPR activation_max_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + activation_max_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR activation_max_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct weight_region_r +{ 
+#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR weight_region_r() : + word0(0) + {} + CONSTEXPR weight_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + weight_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR weight_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct scale_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR scale_region_r() : + word0(0) + {} + CONSTEXPR scale_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + scale_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR scale_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ab_start_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ab_start_r() : + word0(0) + {} + CONSTEXPR ab_start_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ab_start_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ab_start_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct blockdep_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else 
+private: + uint32_t word0; +public: + CONSTEXPR blockdep_r() : + word0(0) + {} + CONSTEXPR blockdep_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + blockdep_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR blockdep_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_src_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma0_src_region_r() : + word0(0) + {} + CONSTEXPR dma0_src_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma0_src_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma0_src_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_dst_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma0_dst_region_r() : + word0(0) + {} + CONSTEXPR dma0_dst_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma0_dst_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma0_dst_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_size0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma0_size0_r() : + word0(0) + 
{} + CONSTEXPR dma0_size0_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma0_size0_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma0_size0_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_size1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma0_size1_r() : + word0(0) + {} + CONSTEXPR dma0_size1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma0_size1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma0_size1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_broadcast_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_broadcast_r() : + word0(0) + {} + CONSTEXPR ifm2_broadcast_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_broadcast_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_broadcast_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_scalar_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_scalar_r() : + word0(0) + {} + CONSTEXPR ifm2_scalar_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void 
operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_scalar_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_scalar_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_precision_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_precision_r() : + word0(0) + {} + CONSTEXPR ifm2_precision_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_precision_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_precision_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_zero_point_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_zero_point_r() : + word0(0) + {} + CONSTEXPR ifm2_zero_point_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_zero_point_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_zero_point_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_width0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_width0_m1_r() : + word0(0) + {} + CONSTEXPR ifm2_width0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR 
operator uint32_t() + { + return word0; + } + ifm2_width0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_width0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_height0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_height0_m1_r() : + word0(0) + {} + CONSTEXPR ifm2_height0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_height0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_height0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_height1_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_height1_m1_r() : + word0(0) + {} + CONSTEXPR ifm2_height1_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_height1_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_height1_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_ib_start_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_ib_start_r() : + word0(0) + {} + CONSTEXPR ifm2_ib_start_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + 
ifm2_ib_start_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_ib_start_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_region_r() : + word0(0) + {} + CONSTEXPR ifm2_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_base0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_base0_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_base0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_base0_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_base1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_base1_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_base1_r(uint64_t init) : + 
word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_base1_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_base2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_base2_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_base2_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_base2_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_base3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_base3_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_base3_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR 
operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_base3_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_stride_x_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_stride_x_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_stride_x_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_stride_x_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_stride_y_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_stride_y_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_stride_y_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_stride_y_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_stride_c_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: 
+ CONSTEXPR ifm_stride_c_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_stride_c_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_stride_c_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_base0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_base0_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_base0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_base0_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_base1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_base1_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_base1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & 
static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_base1_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_base2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_base2_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_base2_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_base2_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_base3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_base3_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_base3_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_base3_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_stride_x_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + 
uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_stride_x_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_stride_x_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_stride_x_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_stride_y_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_stride_y_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_stride_y_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_stride_y_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_stride_c_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_stride_c_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_stride_c_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & 
static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_stride_c_r copy() + { + return *this; + } +#endif +}; + + +struct weight_base_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight_base_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight_base_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight_base_r copy() + { + return *this; + } +#endif +}; + + +struct weight_length_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight_length_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight_length_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + 
weight_length_r copy() + { + return *this; + } +#endif +}; + + +struct scale_base_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR scale_base_r() : + word0(0), + word1(0) + {} + CONSTEXPR scale_base_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + scale_base_r copy() + { + return *this; + } +#endif +}; + + +struct scale_length_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR scale_length_r() : + word0(0), + word1(0) + {} + CONSTEXPR scale_length_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + scale_length_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_scale_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_scale_r() : + word0(0), + word1(0) + {} + CONSTEXPR 
ofm_scale_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_scale_r copy() + { + return *this; + } +#endif +}; + + +struct opa_scale_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR opa_scale_r() : + word0(0), + word1(0) + {} + CONSTEXPR opa_scale_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + opa_scale_r copy() + { + return *this; + } +#endif +}; + + +struct opb_scale_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR opb_scale_r() : + word0(0) + {} + CONSTEXPR opb_scale_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + opb_scale_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR opb_scale_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_src_r +{ +#ifndef 
__cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_dst_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_dst_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_dst_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_dst_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_len_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_len_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_len_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) 
& static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_len_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_skip0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_skip0_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_skip0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_skip0_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_skip1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_skip1_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_skip1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_skip1_r copy() + { + 
return *this; + } +#endif +}; + + +struct ifm2_base0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_base0_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_base0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_base0_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_base1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_base1_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_base1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_base1_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_base2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_base2_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_base2_r(uint64_t init) : + 
word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_base2_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_base3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_base3_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_base3_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_base3_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_stride_x_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_stride_x_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_stride_x_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + 
CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_stride_x_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_stride_y_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_stride_y_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_stride_y_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_stride_y_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_stride_c_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_stride_c_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_stride_c_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_stride_c_r copy() + { + return *this; + } +#endif +}; + + +struct weight1_base_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + 
uint32_t word1; +public: + CONSTEXPR weight1_base_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight1_base_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight1_base_r copy() + { + return *this; + } +#endif +}; + + +struct weight1_length_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight1_length_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight1_length_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight1_length_r copy() + { + return *this; + } +#endif +}; + + +struct scale1_base_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR scale1_base_r() : + word0(0), + word1(0) + {} + CONSTEXPR scale1_base_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = 
static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + scale1_base_r copy() + { + return *this; + } +#endif +}; + + +struct scale1_length_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR scale1_length_r() : + word0(0), + word1(0) + {} + CONSTEXPR scale1_length_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + scale1_length_r copy() + { + return *this; + } +#endif +}; + + +struct revision_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR revision_r() : + word0(0) + {} + CONSTEXPR revision_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + revision_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR revision_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid4_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID4 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid4_r() : + word0(4) + {} + CONSTEXPR pid4_r(uint32_t init) : + word0(init) + {} + 
CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid4_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID4() const + { + auto v = word0; + return v; + } + CONSTEXPR pid4_r& set_PID4(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid5_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID5 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid5_r() : + word0(0) + {} + CONSTEXPR pid5_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid5_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID5() const + { + auto v = word0; + return v; + } + CONSTEXPR pid5_r& set_PID5(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid6_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID6 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid6_r() : + word0(0) + {} + CONSTEXPR pid6_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid6_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID6() const + { + auto v = word0; + return v; + } + CONSTEXPR pid6_r& set_PID6(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid7_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID7 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid7_r() : + word0(0) + {} + CONSTEXPR pid7_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid7_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID7() const + { + auto v = 
word0; + return v; + } + CONSTEXPR pid7_r& set_PID7(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID0 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid0_r() : + word0(129) + {} + CONSTEXPR pid0_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid0_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID0() const + { + auto v = word0; + return v; + } + CONSTEXPR pid0_r& set_PID0(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID1 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid1_r() : + word0(181) + {} + CONSTEXPR pid1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID1() const + { + auto v = word0; + return v; + } + CONSTEXPR pid1_r& set_PID1(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID2 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid2_r() : + word0(11) + {} + CONSTEXPR pid2_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid2_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID2() const + { + auto v = word0; + return v; + } + CONSTEXPR pid2_r& set_PID2(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID3 : 32; + }; 
+ uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid3_r() : + word0(0) + {} + CONSTEXPR pid3_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid3_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID3() const + { + auto v = word0; + return v; + } + CONSTEXPR pid3_r& set_PID3(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct cid0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CID0 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cid0_r() : + word0(13) + {} + CONSTEXPR cid0_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cid0_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_CID0() const + { + auto v = word0; + return v; + } + CONSTEXPR cid0_r& set_CID0(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct cid1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CID1 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cid1_r() : + word0(240) + {} + CONSTEXPR cid1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cid1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_CID1() const + { + auto v = word0; + return v; + } + CONSTEXPR cid1_r& set_CID1(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct cid2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CID2 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cid2_r() : + word0(5) + {} + CONSTEXPR cid2_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = 
value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cid2_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_CID2() const + { + auto v = word0; + return v; + } + CONSTEXPR cid2_r& set_CID2(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct cid3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CID3 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cid3_r() : + word0(177) + {} + CONSTEXPR cid3_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cid3_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_CID3() const + { + auto v = word0; + return v; + } + CONSTEXPR cid3_r& set_CID3(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + +struct NPU_REG +{ + STRUCT id_r ID; + STRUCT status_r STATUS; + STRUCT cmd_r CMD; + STRUCT reset_r RESET; + STRUCT qbase_r QBASE; + STRUCT qread_r QREAD; + STRUCT qconfig_r QCONFIG; + STRUCT qsize_r QSIZE; + STRUCT prot_r PROT; + STRUCT config_r CONFIG; + STRUCT lock_r LOCK; + uint32_t unused0[3]; + STRUCT regioncfg_r REGIONCFG; + STRUCT axi_limit0_r AXI_LIMIT0; + STRUCT axi_limit1_r AXI_LIMIT1; + STRUCT axi_limit2_r AXI_LIMIT2; + STRUCT axi_limit3_r AXI_LIMIT3; + uint32_t unused1[12]; + STRUCT basep_r BASEP[8]; + uint32_t unused2[16]; + STRUCT wd_status_r WD_STATUS; + STRUCT mac_status_r MAC_STATUS; + STRUCT ao_status_r AO_STATUS; + uint32_t unused3[1]; + STRUCT dma_status0_r DMA_STATUS0; + STRUCT dma_status1_r DMA_STATUS1; + uint32_t unused4[10]; + STRUCT clkforce_r CLKFORCE; + STRUCT debug_address_r DEBUG_ADDRESS; + STRUCT debug_misc_r DEBUG_MISC; + STRUCT debugcore_r DEBUGCORE; + STRUCT debug_block_r DEBUG_BLOCK; + uint32_t unused5[11]; + STRUCT pmcr_r PMCR; + STRUCT pmcntenset_r PMCNTENSET; + STRUCT pmcntenclr_r PMCNTENCLR; + STRUCT pmovsset_r PMOVSSET; + STRUCT pmovsclr_r PMOVSCLR; + STRUCT 
pmintset_r PMINTSET; + STRUCT pmintclr_r PMINTCLR; + uint32_t unused6[1]; + STRUCT pmccntr_r PMCCNTR; + STRUCT pmccntr_cfg_r PMCCNTR_CFG; + STRUCT pmcaxi_chan_r PMCAXI_CHAN; + uint32_t unused7[20]; + STRUCT kernel_x_r KERNEL_X; + STRUCT kernel_y_r KERNEL_Y; + STRUCT kernel_w_m1_r KERNEL_W_M1; + STRUCT kernel_h_m1_r KERNEL_H_M1; + STRUCT ofm_cblk_width_m1_r OFM_CBLK_WIDTH_M1; + STRUCT ofm_cblk_height_m1_r OFM_CBLK_HEIGHT_M1; + STRUCT ofm_cblk_depth_m1_r OFM_CBLK_DEPTH_M1; + STRUCT ifm_cblk_depth_m1_r IFM_CBLK_DEPTH_M1; + STRUCT ofm_x_r OFM_X; + STRUCT ofm_y_r OFM_Y; + STRUCT ofm_z_r OFM_Z; + STRUCT ifm_z_r IFM_Z; + STRUCT pad_top_r PAD_TOP; + STRUCT pad_left_r PAD_LEFT; + STRUCT ifm_cblk_width_r IFM_CBLK_WIDTH; + STRUCT ifm_cblk_height_r IFM_CBLK_HEIGHT; + STRUCT dma_ifm_src_r DMA_IFM_SRC; + STRUCT dma_ifm_dst_r DMA_IFM_DST; + STRUCT dma_ofm_src_r DMA_OFM_SRC; + STRUCT dma_ofm_dst_r DMA_OFM_DST; + STRUCT dma_weight_src_r DMA_WEIGHT_SRC; + STRUCT dma_cmd_src_r DMA_CMD_SRC; + STRUCT dma_cmd_size_r DMA_CMD_SIZE; + STRUCT dma_m2m_src_r DMA_M2M_SRC; + STRUCT dma_m2m_dst_r DMA_M2M_DST; + STRUCT current_qread_r CURRENT_QREAD; + STRUCT dma_scale_src_r DMA_SCALE_SRC; + uint32_t unused8[11]; + STRUCT current_block_r CURRENT_BLOCK; + STRUCT current_op_r CURRENT_OP; + STRUCT current_cmd_r CURRENT_CMD; + uint32_t unused9[16]; + STRUCT pmevcntr_r PMEVCNTR[4]; + uint32_t unused10[28]; + STRUCT pmevtyper_r PMEVTYPER[4]; + uint32_t unused11[28]; + STRUCT shared_buffer_r SHARED_BUFFER[256]; + STRUCT ifm_pad_top_r IFM_PAD_TOP; + STRUCT ifm_pad_left_r IFM_PAD_LEFT; + STRUCT ifm_pad_right_r IFM_PAD_RIGHT; + STRUCT ifm_pad_bottom_r IFM_PAD_BOTTOM; + STRUCT ifm_depth_m1_r IFM_DEPTH_M1; + STRUCT ifm_precision_r IFM_PRECISION; + uint32_t unused12[1]; + STRUCT ifm_upscale_r IFM_UPSCALE; + uint32_t unused13[1]; + STRUCT ifm_zero_point_r IFM_ZERO_POINT; + STRUCT ifm_width0_m1_r IFM_WIDTH0_M1; + STRUCT ifm_height0_m1_r IFM_HEIGHT0_M1; + STRUCT ifm_height1_m1_r IFM_HEIGHT1_M1; + STRUCT 
ifm_ib_end_r IFM_IB_END; + uint32_t unused14[1]; + STRUCT ifm_region_r IFM_REGION; + uint32_t unused15[1]; + STRUCT ofm_width_m1_r OFM_WIDTH_M1; + STRUCT ofm_height_m1_r OFM_HEIGHT_M1; + STRUCT ofm_depth_m1_r OFM_DEPTH_M1; + STRUCT ofm_precision_r OFM_PRECISION; + STRUCT ofm_blk_width_m1_r OFM_BLK_WIDTH_M1; + STRUCT ofm_blk_height_m1_r OFM_BLK_HEIGHT_M1; + STRUCT ofm_blk_depth_m1_r OFM_BLK_DEPTH_M1; + STRUCT ofm_zero_point_r OFM_ZERO_POINT; + uint32_t unused16[1]; + STRUCT ofm_width0_m1_r OFM_WIDTH0_M1; + STRUCT ofm_height0_m1_r OFM_HEIGHT0_M1; + STRUCT ofm_height1_m1_r OFM_HEIGHT1_M1; + uint32_t unused17[2]; + STRUCT ofm_region_r OFM_REGION; + STRUCT kernel_width_m1_r KERNEL_WIDTH_M1; + STRUCT kernel_height_m1_r KERNEL_HEIGHT_M1; + STRUCT kernel_stride_r KERNEL_STRIDE; + STRUCT parallel_mode_r PARALLEL_MODE; + STRUCT acc_format_r ACC_FORMAT; + STRUCT activation_r ACTIVATION; + STRUCT activation_min_r ACTIVATION_MIN; + STRUCT activation_max_r ACTIVATION_MAX; + STRUCT weight_region_r WEIGHT_REGION; + STRUCT scale_region_r SCALE_REGION; + uint32_t unused18[3]; + STRUCT ab_start_r AB_START; + uint32_t unused19[1]; + STRUCT blockdep_r BLOCKDEP; + STRUCT dma0_src_region_r DMA0_SRC_REGION; + STRUCT dma0_dst_region_r DMA0_DST_REGION; + STRUCT dma0_size0_r DMA0_SIZE0; + STRUCT dma0_size1_r DMA0_SIZE1; + uint32_t unused20[12]; + STRUCT ifm2_broadcast_r IFM2_BROADCAST; + STRUCT ifm2_scalar_r IFM2_SCALAR; + uint32_t unused21[3]; + STRUCT ifm2_precision_r IFM2_PRECISION; + uint32_t unused22[3]; + STRUCT ifm2_zero_point_r IFM2_ZERO_POINT; + STRUCT ifm2_width0_m1_r IFM2_WIDTH0_M1; + STRUCT ifm2_height0_m1_r IFM2_HEIGHT0_M1; + STRUCT ifm2_height1_m1_r IFM2_HEIGHT1_M1; + STRUCT ifm2_ib_start_r IFM2_IB_START; + uint32_t unused23[1]; + STRUCT ifm2_region_r IFM2_REGION; + uint32_t unused24[48]; + STRUCT ifm_base0_r IFM_BASE0; + STRUCT ifm_base1_r IFM_BASE1; + STRUCT ifm_base2_r IFM_BASE2; + STRUCT ifm_base3_r IFM_BASE3; + STRUCT ifm_stride_x_r IFM_STRIDE_X; + STRUCT ifm_stride_y_r 
IFM_STRIDE_Y; + STRUCT ifm_stride_c_r IFM_STRIDE_C; + uint32_t unused25[2]; + STRUCT ofm_base0_r OFM_BASE0; + STRUCT ofm_base1_r OFM_BASE1; + STRUCT ofm_base2_r OFM_BASE2; + STRUCT ofm_base3_r OFM_BASE3; + STRUCT ofm_stride_x_r OFM_STRIDE_X; + STRUCT ofm_stride_y_r OFM_STRIDE_Y; + STRUCT ofm_stride_c_r OFM_STRIDE_C; + uint32_t unused26[2]; + STRUCT weight_base_r WEIGHT_BASE; + STRUCT weight_length_r WEIGHT_LENGTH; + STRUCT scale_base_r SCALE_BASE; + STRUCT scale_length_r SCALE_LENGTH; + STRUCT ofm_scale_r OFM_SCALE; + STRUCT opa_scale_r OPA_SCALE; + STRUCT opb_scale_r OPB_SCALE; + uint32_t unused27[3]; + STRUCT dma0_src_r DMA0_SRC; + STRUCT dma0_dst_r DMA0_DST; + STRUCT dma0_len_r DMA0_LEN; + STRUCT dma0_skip0_r DMA0_SKIP0; + STRUCT dma0_skip1_r DMA0_SKIP1; + uint32_t unused28[6]; + STRUCT ifm2_base0_r IFM2_BASE0; + STRUCT ifm2_base1_r IFM2_BASE1; + STRUCT ifm2_base2_r IFM2_BASE2; + STRUCT ifm2_base3_r IFM2_BASE3; + STRUCT ifm2_stride_x_r IFM2_STRIDE_X; + STRUCT ifm2_stride_y_r IFM2_STRIDE_Y; + STRUCT ifm2_stride_c_r IFM2_STRIDE_C; + uint32_t unused29[2]; + STRUCT weight1_base_r WEIGHT1_BASE; + STRUCT weight1_length_r WEIGHT1_LENGTH; + STRUCT scale1_base_r SCALE1_BASE; + STRUCT scale1_length_r SCALE1_LENGTH; + uint32_t unused30[280]; + STRUCT revision_r REVISION; + uint32_t unused31[3]; + STRUCT pid4_r PID4; + STRUCT pid5_r PID5; + STRUCT pid6_r PID6; + STRUCT pid7_r PID7; + STRUCT pid0_r PID0; + STRUCT pid1_r PID1; + STRUCT pid2_r PID2; + STRUCT pid3_r PID3; + STRUCT cid0_r CID0; + STRUCT cid1_r CID1; + STRUCT cid2_r CID2; + STRUCT cid3_r CID3; + +#ifdef __cplusplus + enum class access_type_t : uint8_t { RW, RO, WO }; + NPU_REG() + { + reset(); + } + void reset() + { + ID = 268853249; + STATUS = 8; + CMD = 12; + RESET = 0; + QBASE = 0; + QREAD = 0; + QCONFIG = 0; + QSIZE = 0; + PROT = 0; + CONFIG = 268435456; + LOCK = 0; + REGIONCFG = 0; + AXI_LIMIT0 = 0; + AXI_LIMIT1 = 0; + AXI_LIMIT2 = 0; + AXI_LIMIT3 = 0; + for (size_t i = 0; i < (sizeof(BASEP) / 
sizeof(BASEP[0])); ++i) + BASEP[i] = 0; + WD_STATUS = 0; + MAC_STATUS = 0; + AO_STATUS = 0; + DMA_STATUS0 = 0; + DMA_STATUS1 = 0; + CLKFORCE = 0; + DEBUG_ADDRESS = 0; + DEBUG_MISC = 0; + DEBUGCORE = 0; + DEBUG_BLOCK = 0; + PMCR = 8192; + PMCNTENSET = 0; + PMCNTENCLR = 0; + PMOVSSET = 0; + PMOVSCLR = 0; + PMINTSET = 0; + PMINTCLR = 0; + PMCCNTR = 0; + PMCCNTR_CFG = 0; + PMCAXI_CHAN = 0; + KERNEL_X = 0; + KERNEL_Y = 0; + KERNEL_W_M1 = 0; + KERNEL_H_M1 = 0; + OFM_CBLK_WIDTH_M1 = 0; + OFM_CBLK_HEIGHT_M1 = 0; + OFM_CBLK_DEPTH_M1 = 0; + IFM_CBLK_DEPTH_M1 = 0; + OFM_X = 0; + OFM_Y = 0; + OFM_Z = 0; + IFM_Z = 0; + PAD_TOP = 0; + PAD_LEFT = 0; + IFM_CBLK_WIDTH = 0; + IFM_CBLK_HEIGHT = 0; + DMA_IFM_SRC = 0; + DMA_IFM_DST = 0; + DMA_OFM_SRC = 0; + DMA_OFM_DST = 0; + DMA_WEIGHT_SRC = 0; + DMA_CMD_SRC = 0; + DMA_CMD_SIZE = 0; + DMA_M2M_SRC = 0; + DMA_M2M_DST = 0; + CURRENT_QREAD = 0; + DMA_SCALE_SRC = 0; + CURRENT_BLOCK = 0; + CURRENT_OP = 0; + CURRENT_CMD = 0; + for (size_t i = 0; i < (sizeof(PMEVCNTR) / sizeof(PMEVCNTR[0])); ++i) + PMEVCNTR[i] = 0; + for (size_t i = 0; i < (sizeof(PMEVTYPER) / sizeof(PMEVTYPER[0])); ++i) + PMEVTYPER[i] = 0; + for (size_t i = 0; i < (sizeof(SHARED_BUFFER) / sizeof(SHARED_BUFFER[0])); ++i) + SHARED_BUFFER[i] = 0; + IFM_PAD_TOP = 0; + IFM_PAD_LEFT = 0; + IFM_PAD_RIGHT = 0; + IFM_PAD_BOTTOM = 0; + IFM_DEPTH_M1 = 0; + IFM_PRECISION = 0; + IFM_UPSCALE = 0; + IFM_ZERO_POINT = 0; + IFM_WIDTH0_M1 = 0; + IFM_HEIGHT0_M1 = 0; + IFM_HEIGHT1_M1 = 0; + IFM_IB_END = 0; + IFM_REGION = 0; + OFM_WIDTH_M1 = 0; + OFM_HEIGHT_M1 = 0; + OFM_DEPTH_M1 = 0; + OFM_PRECISION = 0; + OFM_BLK_WIDTH_M1 = 0; + OFM_BLK_HEIGHT_M1 = 0; + OFM_BLK_DEPTH_M1 = 0; + OFM_ZERO_POINT = 0; + OFM_WIDTH0_M1 = 0; + OFM_HEIGHT0_M1 = 0; + OFM_HEIGHT1_M1 = 0; + OFM_REGION = 0; + KERNEL_WIDTH_M1 = 0; + KERNEL_HEIGHT_M1 = 0; + KERNEL_STRIDE = 0; + PARALLEL_MODE = 0; + ACC_FORMAT = 0; + ACTIVATION = 0; + ACTIVATION_MIN = 0; + ACTIVATION_MAX = 0; + WEIGHT_REGION = 0; + SCALE_REGION = 0; + AB_START 
= 0; + BLOCKDEP = 0; + DMA0_SRC_REGION = 0; + DMA0_DST_REGION = 0; + DMA0_SIZE0 = 0; + DMA0_SIZE1 = 0; + IFM2_BROADCAST = 0; + IFM2_SCALAR = 0; + IFM2_PRECISION = 0; + IFM2_ZERO_POINT = 0; + IFM2_WIDTH0_M1 = 0; + IFM2_HEIGHT0_M1 = 0; + IFM2_HEIGHT1_M1 = 0; + IFM2_IB_START = 0; + IFM2_REGION = 0; + IFM_BASE0 = 0; + IFM_BASE1 = 0; + IFM_BASE2 = 0; + IFM_BASE3 = 0; + IFM_STRIDE_X = 0; + IFM_STRIDE_Y = 0; + IFM_STRIDE_C = 0; + OFM_BASE0 = 0; + OFM_BASE1 = 0; + OFM_BASE2 = 0; + OFM_BASE3 = 0; + OFM_STRIDE_X = 0; + OFM_STRIDE_Y = 0; + OFM_STRIDE_C = 0; + WEIGHT_BASE = 0; + WEIGHT_LENGTH = 0; + SCALE_BASE = 0; + SCALE_LENGTH = 0; + OFM_SCALE = 0; + OPA_SCALE = 0; + OPB_SCALE = 0; + DMA0_SRC = 0; + DMA0_DST = 0; + DMA0_LEN = 0; + DMA0_SKIP0 = 0; + DMA0_SKIP1 = 0; + IFM2_BASE0 = 0; + IFM2_BASE1 = 0; + IFM2_BASE2 = 0; + IFM2_BASE3 = 0; + IFM2_STRIDE_X = 0; + IFM2_STRIDE_Y = 0; + IFM2_STRIDE_C = 0; + WEIGHT1_BASE = 0; + WEIGHT1_LENGTH = 0; + SCALE1_BASE = 0; + SCALE1_LENGTH = 0; + REVISION = 0; + PID4 = 4; + PID5 = 0; + PID6 = 0; + PID7 = 0; + PID0 = 129; + PID1 = 181; + PID2 = 11; + PID3 = 0; + CID0 = 13; + CID1 = 240; + CID2 = 5; + CID3 = 177; + } + uint32_t& operator[](const int addr_offset) + { + return reinterpret_cast(this)[addr_offset / 4]; + } + access_type_t get_access_type(uint32_t offset) + { + switch (offset) + { + case 0: return access_type_t::RO; + case 4: return access_type_t::RO; + case 8: return access_type_t::RW; + case 12: return access_type_t::RW; + case 16: return access_type_t::RW; + case 24: return access_type_t::RO; + case 28: return access_type_t::RW; + case 32: return access_type_t::RW; + case 36: return access_type_t::RO; + case 40: return access_type_t::RO; + case 44: return access_type_t::RW; + case 60: return access_type_t::RW; + case 64: return access_type_t::RW; + case 68: return access_type_t::RW; + case 72: return access_type_t::RW; + case 76: return access_type_t::RW; + case 128: return access_type_t::RW; + case 136: return 
access_type_t::RW; + case 144: return access_type_t::RW; + case 152: return access_type_t::RW; + case 160: return access_type_t::RW; + case 168: return access_type_t::RW; + case 176: return access_type_t::RW; + case 184: return access_type_t::RW; + case 256: return access_type_t::RO; + case 260: return access_type_t::RO; + case 264: return access_type_t::RO; + case 272: return access_type_t::RO; + case 276: return access_type_t::RO; + case 320: return access_type_t::RW; + case 324: return access_type_t::RW; + case 328: return access_type_t::RW; + case 332: return access_type_t::RW; + case 336: return access_type_t::RW; + case 384: return access_type_t::RW; + case 388: return access_type_t::RW; + case 392: return access_type_t::RW; + case 396: return access_type_t::RW; + case 400: return access_type_t::RW; + case 404: return access_type_t::RW; + case 408: return access_type_t::RW; + case 416: return access_type_t::RW; + case 424: return access_type_t::RW; + case 428: return access_type_t::RW; + case 512: return access_type_t::RO; + case 516: return access_type_t::RO; + case 520: return access_type_t::RO; + case 524: return access_type_t::RO; + case 528: return access_type_t::RO; + case 532: return access_type_t::RO; + case 536: return access_type_t::RO; + case 540: return access_type_t::RO; + case 544: return access_type_t::RO; + case 548: return access_type_t::RO; + case 552: return access_type_t::RO; + case 556: return access_type_t::RO; + case 560: return access_type_t::RO; + case 564: return access_type_t::RO; + case 568: return access_type_t::RO; + case 572: return access_type_t::RO; + case 576: return access_type_t::RO; + case 584: return access_type_t::RO; + case 588: return access_type_t::RO; + case 592: return access_type_t::RO; + case 600: return access_type_t::RO; + case 608: return access_type_t::RO; + case 616: return access_type_t::RO; + case 620: return access_type_t::RO; + case 628: return access_type_t::RO; + case 636: return access_type_t::RO; + 
case 640: return access_type_t::RO; + case 692: return access_type_t::RO; + case 696: return access_type_t::RO; + case 700: return access_type_t::RO; + case 768: return access_type_t::RW; + case 772: return access_type_t::RW; + case 776: return access_type_t::RW; + case 780: return access_type_t::RW; + case 896: return access_type_t::RW; + case 900: return access_type_t::RW; + case 904: return access_type_t::RW; + case 908: return access_type_t::RW; + case 1024: return access_type_t::RW; + case 1028: return access_type_t::RW; + case 1032: return access_type_t::RW; + case 1036: return access_type_t::RW; + case 1040: return access_type_t::RW; + case 1044: return access_type_t::RW; + case 1048: return access_type_t::RW; + case 1052: return access_type_t::RW; + case 1056: return access_type_t::RW; + case 1060: return access_type_t::RW; + case 1064: return access_type_t::RW; + case 1068: return access_type_t::RW; + case 1072: return access_type_t::RW; + case 1076: return access_type_t::RW; + case 1080: return access_type_t::RW; + case 1084: return access_type_t::RW; + case 1088: return access_type_t::RW; + case 1092: return access_type_t::RW; + case 1096: return access_type_t::RW; + case 1100: return access_type_t::RW; + case 1104: return access_type_t::RW; + case 1108: return access_type_t::RW; + case 1112: return access_type_t::RW; + case 1116: return access_type_t::RW; + case 1120: return access_type_t::RW; + case 1124: return access_type_t::RW; + case 1128: return access_type_t::RW; + case 1132: return access_type_t::RW; + case 1136: return access_type_t::RW; + case 1140: return access_type_t::RW; + case 1144: return access_type_t::RW; + case 1148: return access_type_t::RW; + case 1152: return access_type_t::RW; + case 1156: return access_type_t::RW; + case 1160: return access_type_t::RW; + case 1164: return access_type_t::RW; + case 1168: return access_type_t::RW; + case 1172: return access_type_t::RW; + case 1176: return access_type_t::RW; + case 1180: return 
access_type_t::RW; + case 1184: return access_type_t::RW; + case 1188: return access_type_t::RW; + case 1192: return access_type_t::RW; + case 1196: return access_type_t::RW; + case 1200: return access_type_t::RW; + case 1204: return access_type_t::RW; + case 1208: return access_type_t::RW; + case 1212: return access_type_t::RW; + case 1216: return access_type_t::RW; + case 1220: return access_type_t::RW; + case 1224: return access_type_t::RW; + case 1228: return access_type_t::RW; + case 1232: return access_type_t::RW; + case 1236: return access_type_t::RW; + case 1240: return access_type_t::RW; + case 1244: return access_type_t::RW; + case 1248: return access_type_t::RW; + case 1252: return access_type_t::RW; + case 1256: return access_type_t::RW; + case 1260: return access_type_t::RW; + case 1264: return access_type_t::RW; + case 1268: return access_type_t::RW; + case 1272: return access_type_t::RW; + case 1276: return access_type_t::RW; + case 1280: return access_type_t::RW; + case 1284: return access_type_t::RW; + case 1288: return access_type_t::RW; + case 1292: return access_type_t::RW; + case 1296: return access_type_t::RW; + case 1300: return access_type_t::RW; + case 1304: return access_type_t::RW; + case 1308: return access_type_t::RW; + case 1312: return access_type_t::RW; + case 1316: return access_type_t::RW; + case 1320: return access_type_t::RW; + case 1324: return access_type_t::RW; + case 1328: return access_type_t::RW; + case 1332: return access_type_t::RW; + case 1336: return access_type_t::RW; + case 1340: return access_type_t::RW; + case 1344: return access_type_t::RW; + case 1348: return access_type_t::RW; + case 1352: return access_type_t::RW; + case 1356: return access_type_t::RW; + case 1360: return access_type_t::RW; + case 1364: return access_type_t::RW; + case 1368: return access_type_t::RW; + case 1372: return access_type_t::RW; + case 1376: return access_type_t::RW; + case 1380: return access_type_t::RW; + case 1384: return 
access_type_t::RW; + case 1388: return access_type_t::RW; + case 1392: return access_type_t::RW; + case 1396: return access_type_t::RW; + case 1400: return access_type_t::RW; + case 1404: return access_type_t::RW; + case 1408: return access_type_t::RW; + case 1412: return access_type_t::RW; + case 1416: return access_type_t::RW; + case 1420: return access_type_t::RW; + case 1424: return access_type_t::RW; + case 1428: return access_type_t::RW; + case 1432: return access_type_t::RW; + case 1436: return access_type_t::RW; + case 1440: return access_type_t::RW; + case 1444: return access_type_t::RW; + case 1448: return access_type_t::RW; + case 1452: return access_type_t::RW; + case 1456: return access_type_t::RW; + case 1460: return access_type_t::RW; + case 1464: return access_type_t::RW; + case 1468: return access_type_t::RW; + case 1472: return access_type_t::RW; + case 1476: return access_type_t::RW; + case 1480: return access_type_t::RW; + case 1484: return access_type_t::RW; + case 1488: return access_type_t::RW; + case 1492: return access_type_t::RW; + case 1496: return access_type_t::RW; + case 1500: return access_type_t::RW; + case 1504: return access_type_t::RW; + case 1508: return access_type_t::RW; + case 1512: return access_type_t::RW; + case 1516: return access_type_t::RW; + case 1520: return access_type_t::RW; + case 1524: return access_type_t::RW; + case 1528: return access_type_t::RW; + case 1532: return access_type_t::RW; + case 1536: return access_type_t::RW; + case 1540: return access_type_t::RW; + case 1544: return access_type_t::RW; + case 1548: return access_type_t::RW; + case 1552: return access_type_t::RW; + case 1556: return access_type_t::RW; + case 1560: return access_type_t::RW; + case 1564: return access_type_t::RW; + case 1568: return access_type_t::RW; + case 1572: return access_type_t::RW; + case 1576: return access_type_t::RW; + case 1580: return access_type_t::RW; + case 1584: return access_type_t::RW; + case 1588: return 
access_type_t::RW; + case 1592: return access_type_t::RW; + case 1596: return access_type_t::RW; + case 1600: return access_type_t::RW; + case 1604: return access_type_t::RW; + case 1608: return access_type_t::RW; + case 1612: return access_type_t::RW; + case 1616: return access_type_t::RW; + case 1620: return access_type_t::RW; + case 1624: return access_type_t::RW; + case 1628: return access_type_t::RW; + case 1632: return access_type_t::RW; + case 1636: return access_type_t::RW; + case 1640: return access_type_t::RW; + case 1644: return access_type_t::RW; + case 1648: return access_type_t::RW; + case 1652: return access_type_t::RW; + case 1656: return access_type_t::RW; + case 1660: return access_type_t::RW; + case 1664: return access_type_t::RW; + case 1668: return access_type_t::RW; + case 1672: return access_type_t::RW; + case 1676: return access_type_t::RW; + case 1680: return access_type_t::RW; + case 1684: return access_type_t::RW; + case 1688: return access_type_t::RW; + case 1692: return access_type_t::RW; + case 1696: return access_type_t::RW; + case 1700: return access_type_t::RW; + case 1704: return access_type_t::RW; + case 1708: return access_type_t::RW; + case 1712: return access_type_t::RW; + case 1716: return access_type_t::RW; + case 1720: return access_type_t::RW; + case 1724: return access_type_t::RW; + case 1728: return access_type_t::RW; + case 1732: return access_type_t::RW; + case 1736: return access_type_t::RW; + case 1740: return access_type_t::RW; + case 1744: return access_type_t::RW; + case 1748: return access_type_t::RW; + case 1752: return access_type_t::RW; + case 1756: return access_type_t::RW; + case 1760: return access_type_t::RW; + case 1764: return access_type_t::RW; + case 1768: return access_type_t::RW; + case 1772: return access_type_t::RW; + case 1776: return access_type_t::RW; + case 1780: return access_type_t::RW; + case 1784: return access_type_t::RW; + case 1788: return access_type_t::RW; + case 1792: return 
access_type_t::RW; + case 1796: return access_type_t::RW; + case 1800: return access_type_t::RW; + case 1804: return access_type_t::RW; + case 1808: return access_type_t::RW; + case 1812: return access_type_t::RW; + case 1816: return access_type_t::RW; + case 1820: return access_type_t::RW; + case 1824: return access_type_t::RW; + case 1828: return access_type_t::RW; + case 1832: return access_type_t::RW; + case 1836: return access_type_t::RW; + case 1840: return access_type_t::RW; + case 1844: return access_type_t::RW; + case 1848: return access_type_t::RW; + case 1852: return access_type_t::RW; + case 1856: return access_type_t::RW; + case 1860: return access_type_t::RW; + case 1864: return access_type_t::RW; + case 1868: return access_type_t::RW; + case 1872: return access_type_t::RW; + case 1876: return access_type_t::RW; + case 1880: return access_type_t::RW; + case 1884: return access_type_t::RW; + case 1888: return access_type_t::RW; + case 1892: return access_type_t::RW; + case 1896: return access_type_t::RW; + case 1900: return access_type_t::RW; + case 1904: return access_type_t::RW; + case 1908: return access_type_t::RW; + case 1912: return access_type_t::RW; + case 1916: return access_type_t::RW; + case 1920: return access_type_t::RW; + case 1924: return access_type_t::RW; + case 1928: return access_type_t::RW; + case 1932: return access_type_t::RW; + case 1936: return access_type_t::RW; + case 1940: return access_type_t::RW; + case 1944: return access_type_t::RW; + case 1948: return access_type_t::RW; + case 1952: return access_type_t::RW; + case 1956: return access_type_t::RW; + case 1960: return access_type_t::RW; + case 1964: return access_type_t::RW; + case 1968: return access_type_t::RW; + case 1972: return access_type_t::RW; + case 1976: return access_type_t::RW; + case 1980: return access_type_t::RW; + case 1984: return access_type_t::RW; + case 1988: return access_type_t::RW; + case 1992: return access_type_t::RW; + case 1996: return 
access_type_t::RW; + case 2000: return access_type_t::RW; + case 2004: return access_type_t::RW; + case 2008: return access_type_t::RW; + case 2012: return access_type_t::RW; + case 2016: return access_type_t::RW; + case 2020: return access_type_t::RW; + case 2024: return access_type_t::RW; + case 2028: return access_type_t::RW; + case 2032: return access_type_t::RW; + case 2036: return access_type_t::RW; + case 2040: return access_type_t::RW; + case 2044: return access_type_t::RW; + case 2048: return access_type_t::RW; + case 2052: return access_type_t::RW; + case 2056: return access_type_t::RW; + case 2060: return access_type_t::RW; + case 2064: return access_type_t::RW; + case 2068: return access_type_t::RW; + case 2076: return access_type_t::RW; + case 2084: return access_type_t::RW; + case 2088: return access_type_t::RW; + case 2092: return access_type_t::RW; + case 2096: return access_type_t::RW; + case 2100: return access_type_t::RW; + case 2108: return access_type_t::RW; + case 2116: return access_type_t::RW; + case 2120: return access_type_t::RW; + case 2124: return access_type_t::RW; + case 2128: return access_type_t::RW; + case 2132: return access_type_t::RW; + case 2136: return access_type_t::RW; + case 2140: return access_type_t::RW; + case 2144: return access_type_t::RW; + case 2152: return access_type_t::RW; + case 2156: return access_type_t::RW; + case 2160: return access_type_t::RW; + case 2172: return access_type_t::RW; + case 2176: return access_type_t::RW; + case 2180: return access_type_t::RW; + case 2184: return access_type_t::RW; + case 2188: return access_type_t::RW; + case 2192: return access_type_t::RW; + case 2196: return access_type_t::RW; + case 2200: return access_type_t::RW; + case 2204: return access_type_t::RW; + case 2208: return access_type_t::RW; + case 2212: return access_type_t::RW; + case 2228: return access_type_t::RW; + case 2236: return access_type_t::RW; + case 2240: return access_type_t::RW; + case 2244: return 
access_type_t::RW; + case 2248: return access_type_t::RW; + case 2252: return access_type_t::RW; + case 2304: return access_type_t::RW; + case 2308: return access_type_t::RW; + case 2324: return access_type_t::RW; + case 2340: return access_type_t::RW; + case 2344: return access_type_t::RW; + case 2348: return access_type_t::RW; + case 2352: return access_type_t::RW; + case 2356: return access_type_t::RW; + case 2364: return access_type_t::RW; + case 2560: return access_type_t::RW; + case 2568: return access_type_t::RW; + case 2576: return access_type_t::RW; + case 2584: return access_type_t::RW; + case 2592: return access_type_t::RW; + case 2600: return access_type_t::RW; + case 2608: return access_type_t::RW; + case 2624: return access_type_t::RW; + case 2632: return access_type_t::RW; + case 2640: return access_type_t::RW; + case 2648: return access_type_t::RW; + case 2656: return access_type_t::RW; + case 2664: return access_type_t::RW; + case 2672: return access_type_t::RW; + case 2688: return access_type_t::RW; + case 2696: return access_type_t::RW; + case 2704: return access_type_t::RW; + case 2712: return access_type_t::RW; + case 2720: return access_type_t::RW; + case 2728: return access_type_t::RW; + case 2736: return access_type_t::RW; + case 2752: return access_type_t::RW; + case 2760: return access_type_t::RW; + case 2768: return access_type_t::RW; + case 2776: return access_type_t::RW; + case 2784: return access_type_t::RW; + case 2816: return access_type_t::RW; + case 2824: return access_type_t::RW; + case 2832: return access_type_t::RW; + case 2840: return access_type_t::RW; + case 2848: return access_type_t::RW; + case 2856: return access_type_t::RW; + case 2864: return access_type_t::RW; + case 2880: return access_type_t::RW; + case 2888: return access_type_t::RW; + case 2896: return access_type_t::RW; + case 2904: return access_type_t::RW; + case 4032: return access_type_t::RO; + case 4048: return access_type_t::RO; + case 4052: return 
access_type_t::RO; + case 4056: return access_type_t::RO; + case 4060: return access_type_t::RO; + case 4064: return access_type_t::RO; + case 4068: return access_type_t::RO; + case 4072: return access_type_t::RO; + case 4076: return access_type_t::RO; + case 4080: return access_type_t::RO; + case 4084: return access_type_t::RO; + case 4088: return access_type_t::RO; + case 4092: return access_type_t::RO; + default: return access_type_t::RO; + } + } +#endif +}; + +#ifdef __cplusplus +struct isa +{ +#ifdef NPU_DISASSEMBLE +static int disassemble(const uint32_t* in, std::string& op, std::vector>& fields) +{ + switch (*in & 0xffff) + { + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP): + { + const npu_op_stop_t& v = *reinterpret_cast(in); + op = "NPU_OP_STOP"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ): + { + const npu_op_irq_t& v = *reinterpret_cast(in); + op = "NPU_OP_IRQ"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_CONV): + { + const npu_op_conv_t& v = *reinterpret_cast(in); + op = "NPU_OP_CONV"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DEPTHWISE): + { + const npu_op_depthwise_t& v = *reinterpret_cast(in); + op = "NPU_OP_DEPTHWISE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL): + { + const npu_op_pool_t& v = *reinterpret_cast(in); + op = "NPU_OP_POOL"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE): + { + const npu_op_elementwise_t& v = 
*reinterpret_cast(in); + op = "NPU_OP_ELEMENTWISE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_START): + { + const npu_op_dma_start_t& v = *reinterpret_cast(in); + op = "NPU_OP_DMA_START"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT): + { + const npu_op_dma_wait_t& v = *reinterpret_cast(in); + op = "NPU_OP_DMA_WAIT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT): + { + const npu_op_kernel_wait_t& v = *reinterpret_cast(in); + op = "NPU_OP_KERNEL_WAIT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK): + { + const npu_op_pmu_mask_t& v = *reinterpret_cast(in); + op = "NPU_OP_PMU_MASK"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP): + { + const npu_set_ifm_pad_top_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PAD_TOP"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT): + { + const npu_set_ifm_pad_left_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PAD_LEFT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT): + { + const npu_set_ifm_pad_right_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PAD_RIGHT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM): + { + 
const npu_set_ifm_pad_bottom_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PAD_BOTTOM"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1): + { + const npu_set_ifm_depth_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_DEPTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION): + { + const npu_set_ifm_precision_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PRECISION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE): + { + const npu_set_ifm_upscale_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_UPSCALE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT): + { + const npu_set_ifm_zero_point_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_ZERO_POINT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1): + { + const npu_set_ifm_width0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_WIDTH0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1): + { + const npu_set_ifm_height0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_HEIGHT0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1): + { + const npu_set_ifm_height1_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_HEIGHT1_M1"; + v.disassemble(fields); + break; + } + case 
(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_IB_END): + { + const npu_set_ifm_ib_end_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_IB_END"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION): + { + const npu_set_ifm_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1): + { + const npu_set_ofm_width_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_WIDTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1): + { + const npu_set_ofm_height_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_HEIGHT_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1): + { + const npu_set_ofm_depth_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_DEPTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION): + { + const npu_set_ofm_precision_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_PRECISION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1): + { + const npu_set_ofm_blk_width_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BLK_WIDTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1): + { + const npu_set_ofm_blk_height_m1_t& v = 
*reinterpret_cast(in); + op = "NPU_SET_OFM_BLK_HEIGHT_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1): + { + const npu_set_ofm_blk_depth_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BLK_DEPTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT): + { + const npu_set_ofm_zero_point_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_ZERO_POINT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1): + { + const npu_set_ofm_width0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_WIDTH0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1): + { + const npu_set_ofm_height0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_HEIGHT0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1): + { + const npu_set_ofm_height1_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_HEIGHT1_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION): + { + const npu_set_ofm_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1): + { + const npu_set_kernel_width_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_KERNEL_WIDTH_M1"; + v.disassemble(fields); + break; + } + case 
(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1): + { + const npu_set_kernel_height_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_KERNEL_HEIGHT_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE): + { + const npu_set_kernel_stride_t& v = *reinterpret_cast(in); + op = "NPU_SET_KERNEL_STRIDE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_PARALLEL_MODE): + { + const npu_set_parallel_mode_t& v = *reinterpret_cast(in); + op = "NPU_SET_PARALLEL_MODE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT): + { + const npu_set_acc_format_t& v = *reinterpret_cast(in); + op = "NPU_SET_ACC_FORMAT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION): + { + const npu_set_activation_t& v = *reinterpret_cast(in); + op = "NPU_SET_ACTIVATION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN): + { + const npu_set_activation_min_t& v = *reinterpret_cast(in); + op = "NPU_SET_ACTIVATION_MIN"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX): + { + const npu_set_activation_max_t& v = *reinterpret_cast(in); + op = "NPU_SET_ACTIVATION_MAX"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION): + { + const npu_set_weight_region_t& v 
= *reinterpret_cast(in); + op = "NPU_SET_WEIGHT_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION): + { + const npu_set_scale_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_SCALE_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_AB_START): + { + const npu_set_ab_start_t& v = *reinterpret_cast(in); + op = "NPU_SET_AB_START"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP): + { + const npu_set_blockdep_t& v = *reinterpret_cast(in); + op = "NPU_SET_BLOCKDEP"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION): + { + const npu_set_dma0_src_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SRC_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION): + { + const npu_set_dma0_dst_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_DST_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0): + { + const npu_set_dma0_size0_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SIZE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1): + { + const npu_set_dma0_size1_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SIZE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | 
static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST): + { + const npu_set_ifm2_broadcast_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BROADCAST"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_SCALAR): + { + const npu_set_ifm2_scalar_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_SCALAR"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION): + { + const npu_set_ifm2_precision_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_PRECISION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT): + { + const npu_set_ifm2_zero_point_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_ZERO_POINT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1): + { + const npu_set_ifm2_width0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_WIDTH0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1): + { + const npu_set_ifm2_height0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_HEIGHT0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1): + { + const npu_set_ifm2_height1_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_HEIGHT1_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_IB_START): + { + const npu_set_ifm2_ib_start_t& v = *reinterpret_cast(in); + op = 
"NPU_SET_IFM2_IB_START"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION): + { + const npu_set_ifm2_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0): + { + const npu_set_ifm_base0_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_BASE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1): + { + const npu_set_ifm_base1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_BASE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2): + { + const npu_set_ifm_base2_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_BASE2"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3): + { + const npu_set_ifm_base3_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_BASE3"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X): + { + const npu_set_ifm_stride_x_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_STRIDE_X"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y): + { + const npu_set_ifm_stride_y_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_STRIDE_Y"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C): + { + const 
npu_set_ifm_stride_c_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_STRIDE_C"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0): + { + const npu_set_ofm_base0_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BASE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1): + { + const npu_set_ofm_base1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BASE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2): + { + const npu_set_ofm_base2_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BASE2"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3): + { + const npu_set_ofm_base3_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BASE3"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X): + { + const npu_set_ofm_stride_x_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_STRIDE_X"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y): + { + const npu_set_ofm_stride_y_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_STRIDE_Y"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C): + { + const npu_set_ofm_stride_c_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_STRIDE_C"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | 
static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE): + { + const npu_set_weight_base_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT_BASE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH): + { + const npu_set_weight_length_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT_LENGTH"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE): + { + const npu_set_scale_base_t& v = *reinterpret_cast(in); + op = "NPU_SET_SCALE_BASE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH): + { + const npu_set_scale_length_t& v = *reinterpret_cast(in); + op = "NPU_SET_SCALE_LENGTH"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE): + { + const npu_set_ofm_scale_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_SCALE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPA_SCALE): + { + const npu_set_opa_scale_t& v = *reinterpret_cast(in); + op = "NPU_SET_OPA_SCALE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPB_SCALE): + { + const npu_set_opb_scale_t& v = *reinterpret_cast(in); + op = "NPU_SET_OPB_SCALE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC): + { + const npu_set_dma0_src_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SRC"; + v.disassemble(fields); + break; + } + case 
(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST): + { + const npu_set_dma0_dst_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_DST"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN): + { + const npu_set_dma0_len_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_LEN"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP0): + { + const npu_set_dma0_skip0_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SKIP0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP1): + { + const npu_set_dma0_skip1_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SKIP1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0): + { + const npu_set_ifm2_base0_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BASE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1): + { + const npu_set_ifm2_base1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BASE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2): + { + const npu_set_ifm2_base2_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BASE2"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3): + { + const npu_set_ifm2_base3_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BASE3"; + v.disassemble(fields); 
+ break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X): + { + const npu_set_ifm2_stride_x_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_STRIDE_X"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y): + { + const npu_set_ifm2_stride_y_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_STRIDE_Y"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C): + { + const npu_set_ifm2_stride_c_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_STRIDE_C"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE): + { + const npu_set_weight1_base_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT1_BASE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH): + { + const npu_set_weight1_length_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT1_LENGTH"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_BASE): + { + const npu_set_scale1_base_t& v = *reinterpret_cast(in); + op = "NPU_SET_SCALE1_BASE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_LENGTH): + { + const npu_set_scale1_length_t& v = *reinterpret_cast(in); + op = "NPU_SET_SCALE1_LENGTH"; + v.disassemble(fields); + break; + } + default: break; + } + return (*in & (3<<14)) != 0 ? 
2 : 1; +} +#endif +#endif + +struct npu_op_stop_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t mask:16; +#ifdef __cplusplus +public: + npu_op_stop_t(uint32_t _mask) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mask(_mask & ((1U << 16)-1)) + {} + CONSTEXPR npu_op_stop_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mask(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(mask) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_stop_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_stop_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_mask() const + { + return static_cast(mask); + } + CONSTEXPR npu_op_stop_t& set_mask(uint32_t value) + { + assert((value >> 16) == 0); + mask = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("mask", std::to_string(mask))); + } +#endif +#endif +}; + +struct npu_op_irq_t +{ 
+#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t mask:16; +#ifdef __cplusplus +public: + npu_op_irq_t(uint32_t _mask) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mask(_mask & ((1U << 16)-1)) + {} + CONSTEXPR npu_op_irq_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mask(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(mask) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_irq_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_irq_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_mask() const + { + return static_cast(mask); + } + CONSTEXPR npu_op_irq_t& set_mask(uint32_t value) + { + assert((value >> 16) == 0); + mask = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("mask", std::to_string(mask))); + } +#endif +#endif +}; + +struct npu_op_conv_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + 
uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; +#ifdef __cplusplus +public: + CONSTEXPR npu_op_conv_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_CONV)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_CONV) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_CONV); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_conv_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_conv_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>&) const + { + } +#endif +#endif +}; + +struct npu_op_depthwise_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; +#ifdef __cplusplus +public: + CONSTEXPR npu_op_depthwise_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DEPTHWISE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DEPTHWISE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DEPTHWISE); control = 
static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_depthwise_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_depthwise_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>&) const + { + } +#endif +#endif +}; + +struct npu_op_pool_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t pooling_mode:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_op_pool_t(NPU_NAMESPACE::pooling_mode _pooling_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pooling_mode(static_cast(_pooling_mode) & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_pool_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pooling_mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pooling_mode) << 16; + return word; + } + CONSTEXPR 
NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_pool_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_pool_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pooling_mode get_pooling_mode() const + { + return static_cast(pooling_mode); + } + CONSTEXPR npu_op_pool_t& set_pooling_mode(NPU_NAMESPACE::pooling_mode value) + { + pooling_mode = static_cast(value) & ((1U << 3)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pooling_mode", (pooling_mode < (sizeof(pooling_mode_str)/sizeof(pooling_mode_str[0])) ? pooling_mode_str[pooling_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_op_elementwise_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t elementwise_mode:6; + uint32_t reserved1:10; +#ifdef __cplusplus +public: + npu_op_elementwise_t(NPU_NAMESPACE::elementwise_mode _elementwise_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + elementwise_mode(static_cast(_elementwise_mode) & ((1U << 6)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_elementwise_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + elementwise_mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = 
static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(elementwise_mode) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_elementwise_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_elementwise_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::elementwise_mode get_elementwise_mode() const + { + return static_cast(elementwise_mode); + } + CONSTEXPR npu_op_elementwise_t& set_elementwise_mode(NPU_NAMESPACE::elementwise_mode value) + { + elementwise_mode = static_cast(value) & ((1U << 6)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("elementwise_mode", (elementwise_mode < (sizeof(elementwise_mode_str)/sizeof(elementwise_mode_str[0])) ? 
elementwise_mode_str[elementwise_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_op_dma_start_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; +#ifdef __cplusplus +public: + CONSTEXPR npu_op_dma_start_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_START)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_START) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_START); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_dma_start_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_dma_start_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>&) const + { + } +#endif +#endif +}; + +struct npu_op_dma_wait_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t k:1; + uint32_t reserved1:15; +#ifdef __cplusplus +public: + npu_op_dma_wait_t(uint32_t _k) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + k(_k & ((1U << 1)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_dma_wait_t() : + 
opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + k(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(k) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_dma_wait_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_dma_wait_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_k() const + { + return static_cast(k); + } + CONSTEXPR npu_op_dma_wait_t& set_k(uint32_t value) + { + assert((value >> 1) == 0); + k = static_cast(value & ((1U << 1)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("k", std::to_string(k))); + } +#endif +#endif +}; + +struct npu_op_kernel_wait_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t n:2; + uint32_t reserved1:14; +#ifdef __cplusplus +public: + npu_op_kernel_wait_t(uint32_t _n) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + n(_n & ((1U << 2)-1)), + reserved1(0) + {} + CONSTEXPR 
npu_op_kernel_wait_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + n(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(n) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_kernel_wait_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_kernel_wait_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_n() const + { + return static_cast(n); + } + CONSTEXPR npu_op_kernel_wait_t& set_n(uint32_t value) + { + assert((value >> 2) == 0); + n = static_cast(value & ((1U << 2)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("n", std::to_string(n))); + } +#endif +#endif +}; + +struct npu_op_pmu_mask_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t enable:1; + uint32_t reserved1:15; +#ifdef __cplusplus +public: + npu_op_pmu_mask_t(uint32_t _enable) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + enable(_enable & ((1U << 
1)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_pmu_mask_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + enable(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(enable) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_pmu_mask_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_pmu_mask_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_enable() const + { + return static_cast(enable); + } + CONSTEXPR npu_op_pmu_mask_t& set_enable(uint32_t value) + { + assert((value >> 1) == 0); + enable = static_cast(value & ((1U << 1)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("enable", std::to_string(enable))); + } +#endif +#endif +}; + +struct npu_set_ifm_pad_top_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t pad:7; + uint32_t reserved1:9; +#ifdef __cplusplus +public: + npu_set_ifm_pad_top_t(uint32_t _pad) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(_pad & ((1U << 7)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_pad_top_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pad) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_pad_top_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_pad_top_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_pad() const + { + return static_cast(pad); + } + CONSTEXPR npu_set_ifm_pad_top_t& set_pad(uint32_t value) + { + assert((value >> 7) == 0); + pad = static_cast(value & ((1U << 7)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pad", std::to_string(pad))); + } +#endif +#endif +}; + +struct npu_set_ifm_pad_left_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t pad:7; + uint32_t reserved1:9; +#ifdef __cplusplus +public: + npu_set_ifm_pad_left_t(uint32_t _pad) : + 
opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(_pad & ((1U << 7)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_pad_left_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pad) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_pad_left_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_pad_left_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_pad() const + { + return static_cast(pad); + } + CONSTEXPR npu_set_ifm_pad_left_t& set_pad(uint32_t value) + { + assert((value >> 7) == 0); + pad = static_cast(value & ((1U << 7)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pad", std::to_string(pad))); + } +#endif +#endif +}; + +struct npu_set_ifm_pad_right_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t pad:8; + uint32_t reserved1:8; 
+#ifdef __cplusplus +public: + npu_set_ifm_pad_right_t(uint32_t _pad) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(_pad & ((1U << 8)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_pad_right_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pad) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_pad_right_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_pad_right_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_pad() const + { + return static_cast(pad); + } + CONSTEXPR npu_set_ifm_pad_right_t& set_pad(uint32_t value) + { + assert((value >> 8) == 0); + pad = static_cast(value & ((1U << 8)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pad", std::to_string(pad))); + } +#endif +#endif +}; + +struct npu_set_ifm_pad_bottom_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + 
uint32_t reserved0:4; + uint32_t control:2; + uint32_t pad:8; + uint32_t reserved1:8; +#ifdef __cplusplus +public: + npu_set_ifm_pad_bottom_t(uint32_t _pad) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(_pad & ((1U << 8)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_pad_bottom_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pad) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_pad_bottom_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_pad_bottom_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_pad() const + { + return static_cast(pad); + } + CONSTEXPR npu_set_ifm_pad_bottom_t& set_pad(uint32_t value) + { + assert((value >> 8) == 0); + pad = static_cast(value & ((1U << 8)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pad", std::to_string(pad))); + } +#endif +#endif +}; + 
+struct npu_set_ifm_depth_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t depth_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm_depth_m1_t(uint32_t _depth_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(_depth_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_depth_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(depth_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_depth_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_depth_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_depth_m1() const + { + return static_cast(depth_m1); + } + CONSTEXPR npu_set_ifm_depth_m1_t& set_depth_m1(uint32_t value) + { + assert((value >> 16) == 0); + depth_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + 
fields.push_back(std::make_pair("depth_m1", std::to_string(depth_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm_precision_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t activation_type:1; + uint32_t reserved1:1; + uint32_t activation_precision:2; + uint32_t reserved2:2; + uint32_t activation_format:2; + uint32_t scale_mode:2; + uint32_t reserved3:4; + uint32_t round_mode:2; +#ifdef __cplusplus +public: + npu_set_ifm_precision_t(NPU_NAMESPACE::activation_type _activation_type, NPU_NAMESPACE::activation_precision _activation_precision, NPU_NAMESPACE::activation_format _activation_format, NPU_NAMESPACE::ifm_scale_mode _scale_mode, NPU_NAMESPACE::round_mode _round_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(static_cast(_activation_type) & ((1U << 1)-1)), + reserved1(0), + activation_precision(static_cast(_activation_precision) & ((1U << 2)-1)), + reserved2(0), + activation_format(static_cast(_activation_format) & ((1U << 2)-1)), + scale_mode(static_cast(_scale_mode) & ((1U << 2)-1)), + reserved3(0), + round_mode(static_cast(_round_mode) & ((1U << 2)-1)) + {} + CONSTEXPR npu_set_ifm_precision_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(0), + reserved1(0), + activation_precision(0), + reserved2(0), + activation_format(0), + scale_mode(0), + reserved3(0), + round_mode(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator 
uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(activation_type) << 16; + word |= uint32_t(activation_precision) << 18; + word |= uint32_t(activation_format) << 22; + word |= uint32_t(scale_mode) << 24; + word |= uint32_t(round_mode) << 30; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_precision_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_precision_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_type get_activation_type() const + { + return static_cast(activation_type); + } + CONSTEXPR npu_set_ifm_precision_t& set_activation_type(NPU_NAMESPACE::activation_type value) + { + activation_type = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_precision get_activation_precision() const + { + return static_cast(activation_precision); + } + CONSTEXPR npu_set_ifm_precision_t& set_activation_precision(NPU_NAMESPACE::activation_precision value) + { + activation_precision = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_format get_activation_format() const + { + return static_cast(activation_format); + } + CONSTEXPR npu_set_ifm_precision_t& set_activation_format(NPU_NAMESPACE::activation_format value) + { + activation_format = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::ifm_scale_mode get_scale_mode() const + { + return static_cast(scale_mode); + } + CONSTEXPR npu_set_ifm_precision_t& set_scale_mode(NPU_NAMESPACE::ifm_scale_mode value) + { + scale_mode = static_cast(value) & ((1U << 
2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::round_mode get_round_mode() const + { + return static_cast(round_mode); + } + CONSTEXPR npu_set_ifm_precision_t& set_round_mode(NPU_NAMESPACE::round_mode value) + { + round_mode = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("activation_type", (activation_type < (sizeof(activation_type_str)/sizeof(activation_type_str[0])) ? activation_type_str[activation_type] : "****"))); + fields.push_back(std::make_pair("activation_precision", (activation_precision < (sizeof(activation_precision_str)/sizeof(activation_precision_str[0])) ? activation_precision_str[activation_precision] : "****"))); + fields.push_back(std::make_pair("activation_format", (activation_format < (sizeof(activation_format_str)/sizeof(activation_format_str[0])) ? activation_format_str[activation_format] : "****"))); + fields.push_back(std::make_pair("scale_mode", (scale_mode < (sizeof(ifm_scale_mode_str)/sizeof(ifm_scale_mode_str[0])) ? ifm_scale_mode_str[scale_mode] : "****"))); + fields.push_back(std::make_pair("round_mode", (round_mode < (sizeof(round_mode_str)/sizeof(round_mode_str[0])) ? 
round_mode_str[round_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ifm_upscale_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t mode:2; + uint32_t reserved1:14; +#ifdef __cplusplus +public: + npu_set_ifm_upscale_t(NPU_NAMESPACE::ifm_upscale_mode _mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mode(static_cast(_mode) & ((1U << 2)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_upscale_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(mode) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_upscale_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_upscale_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::ifm_upscale_mode get_mode() const + { + return static_cast(mode); + } + CONSTEXPR npu_set_ifm_upscale_t& set_mode(NPU_NAMESPACE::ifm_upscale_mode value) + { + mode = static_cast(value) & ((1U << 
2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("mode", (mode < (sizeof(ifm_upscale_mode_str)/sizeof(ifm_upscale_mode_str[0])) ? ifm_upscale_mode_str[mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ifm_zero_point_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t zero_point:16; +#ifdef __cplusplus +public: + npu_set_ifm_zero_point_t(uint32_t _zero_point) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(_zero_point & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_zero_point_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(zero_point) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_zero_point_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_zero_point_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t 
get_zero_point() const + { + return static_cast(zero_point); + } + CONSTEXPR npu_set_ifm_zero_point_t& set_zero_point(uint32_t value) + { + assert((value >> 16) == 0); + zero_point = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("zero_point", std::to_string(zero_point))); + } +#endif +#endif +}; + +struct npu_set_ifm_width0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm_width0_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_width0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_width0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_width0_m1_t& 
set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_ifm_width0_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm_height0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm_height0_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_height0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_height0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR 
NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_height0_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ifm_height0_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm_height1_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm_height1_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_height1_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_height1_m1_t& 
set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_height1_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ifm_height1_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm_ib_end_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t ib_end:6; + uint32_t reserved1:10; +#ifdef __cplusplus +public: + npu_set_ifm_ib_end_t(uint32_t _ib_end) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_IB_END)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + ib_end(_ib_end & ((1U << 6)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_ib_end_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_IB_END)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + ib_end(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_IB_END) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_IB_END); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(ib_end) << 16; + return word; + } + CONSTEXPR 
NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_ib_end_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_ib_end_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_ib_end() const + { + return static_cast(ib_end); + } + CONSTEXPR npu_set_ifm_ib_end_t& set_ib_end(uint32_t value) + { + assert((value >> 6) == 0); + ib_end = static_cast(value & ((1U << 6)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("ib_end", std::to_string(ib_end))); + } +#endif +#endif +}; + +struct npu_set_ifm_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_ifm_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= 
uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_ifm_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_ofm_width_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_width_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_width_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + 
uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_width_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_width_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_ofm_width_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_height_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_height_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_height_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1); 
control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_height_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_height_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ofm_height_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_depth_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t depth_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_depth_m1_t(uint32_t _depth_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(_depth_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_depth_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR 
void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(depth_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_depth_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_depth_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_depth_m1() const + { + return static_cast(depth_m1); + } + CONSTEXPR npu_set_ofm_depth_m1_t& set_depth_m1(uint32_t value) + { + assert((value >> 16) == 0); + depth_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("depth_m1", std::to_string(depth_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_precision_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t activation_type:1; + uint32_t activation_precision:2; + uint32_t reserved1:3; + uint32_t activation_format:2; + uint32_t scale_mode:1; + uint32_t reserved2:5; + uint32_t round_mode:2; +#ifdef __cplusplus +public: + npu_set_ofm_precision_t(NPU_NAMESPACE::activation_type _activation_type, NPU_NAMESPACE::activation_precision _activation_precision, NPU_NAMESPACE::activation_format _activation_format, NPU_NAMESPACE::ofm_scale_mode _scale_mode, NPU_NAMESPACE::round_mode _round_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(static_cast(_activation_type) & ((1U << 1)-1)), + activation_precision(static_cast(_activation_precision) & ((1U << 2)-1)), + reserved1(0), + activation_format(static_cast(_activation_format) & ((1U << 2)-1)), + scale_mode(static_cast(_scale_mode) & ((1U << 1)-1)), + reserved2(0), + round_mode(static_cast(_round_mode) & ((1U << 2)-1)) + {} + CONSTEXPR npu_set_ofm_precision_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(0), + activation_precision(0), + reserved1(0), + activation_format(0), + scale_mode(0), + reserved2(0), + round_mode(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(activation_type) << 16; + word |= uint32_t(activation_precision) << 17; + word |= uint32_t(activation_format) << 22; + word |= uint32_t(scale_mode) << 24; + word |= uint32_t(round_mode) << 30; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_precision_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_precision_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_type 
get_activation_type() const + { + return static_cast(activation_type); + } + CONSTEXPR npu_set_ofm_precision_t& set_activation_type(NPU_NAMESPACE::activation_type value) + { + activation_type = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_precision get_activation_precision() const + { + return static_cast(activation_precision); + } + CONSTEXPR npu_set_ofm_precision_t& set_activation_precision(NPU_NAMESPACE::activation_precision value) + { + activation_precision = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_format get_activation_format() const + { + return static_cast(activation_format); + } + CONSTEXPR npu_set_ofm_precision_t& set_activation_format(NPU_NAMESPACE::activation_format value) + { + activation_format = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::ofm_scale_mode get_scale_mode() const + { + return static_cast(scale_mode); + } + CONSTEXPR npu_set_ofm_precision_t& set_scale_mode(NPU_NAMESPACE::ofm_scale_mode value) + { + scale_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::round_mode get_round_mode() const + { + return static_cast(round_mode); + } + CONSTEXPR npu_set_ofm_precision_t& set_round_mode(NPU_NAMESPACE::round_mode value) + { + round_mode = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("activation_type", (activation_type < (sizeof(activation_type_str)/sizeof(activation_type_str[0])) ? activation_type_str[activation_type] : "****"))); + fields.push_back(std::make_pair("activation_precision", (activation_precision < (sizeof(activation_precision_str)/sizeof(activation_precision_str[0])) ? 
activation_precision_str[activation_precision] : "****"))); + fields.push_back(std::make_pair("activation_format", (activation_format < (sizeof(activation_format_str)/sizeof(activation_format_str[0])) ? activation_format_str[activation_format] : "****"))); + fields.push_back(std::make_pair("scale_mode", (scale_mode < (sizeof(ofm_scale_mode_str)/sizeof(ofm_scale_mode_str[0])) ? ofm_scale_mode_str[scale_mode] : "****"))); + fields.push_back(std::make_pair("round_mode", (round_mode < (sizeof(round_mode_str)/sizeof(round_mode_str[0])) ? round_mode_str[round_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ofm_blk_width_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:6; + uint32_t reserved1:10; +#ifdef __cplusplus +public: + npu_set_ofm_blk_width_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 6)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ofm_blk_width_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_blk_width_m1_t& 
set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_blk_width_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_ofm_blk_width_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 6) == 0); + width_m1 = static_cast(value & ((1U << 6)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_blk_height_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:5; + uint32_t reserved1:11; +#ifdef __cplusplus +public: + npu_set_ofm_blk_height_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 5)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ofm_blk_height_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word 
|= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_blk_height_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_blk_height_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ofm_blk_height_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 5) == 0); + height_m1 = static_cast(value & ((1U << 5)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_blk_depth_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t depth_m1:7; + uint32_t reserved1:9; +#ifdef __cplusplus +public: + npu_set_ofm_blk_depth_m1_t(uint32_t _depth_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(_depth_m1 & ((1U << 7)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ofm_blk_depth_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1); 
control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(depth_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_blk_depth_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_blk_depth_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_depth_m1() const + { + return static_cast(depth_m1); + } + CONSTEXPR npu_set_ofm_blk_depth_m1_t& set_depth_m1(uint32_t value) + { + assert((value >> 7) == 0); + depth_m1 = static_cast(value & ((1U << 7)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("depth_m1", std::to_string(depth_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_zero_point_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t zero_point:16; +#ifdef __cplusplus +public: + npu_set_ofm_zero_point_t(uint32_t _zero_point) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(_zero_point & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_zero_point_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT) && control == 
static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(zero_point) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_zero_point_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_zero_point_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_zero_point() const + { + return static_cast(zero_point); + } + CONSTEXPR npu_set_ofm_zero_point_t& set_zero_point(uint32_t value) + { + assert((value >> 16) == 0); + zero_point = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("zero_point", std::to_string(zero_point))); + } +#endif +#endif +}; + +struct npu_set_ofm_width0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_width0_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_width0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool 
valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_width0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_width0_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_ofm_width0_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_height0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_height0_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_height0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1)), + 
reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_height0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_height0_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ofm_height0_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_height1_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_height1_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + 
CONSTEXPR npu_set_ofm_height1_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_height1_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_height1_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ofm_height1_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_ofm_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION)), + 
reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ofm_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_ofm_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_kernel_width_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_kernel_width_m1_t(uint32_t 
_width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_kernel_width_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_kernel_width_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_kernel_width_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_kernel_width_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_kernel_height_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + 
uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_kernel_height_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_kernel_height_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_kernel_height_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_kernel_height_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_kernel_height_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif 
+#endif +}; + +struct npu_set_kernel_stride_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t stride_x_lsb:1; + uint32_t stride_y_lsb:1; + uint32_t weight_order:1; + uint32_t dilation_x:1; + uint32_t dilation_y:1; + uint32_t decomposition:1; + uint32_t stride_x_msb:1; + uint32_t reserved1:2; + uint32_t stride_y_msb:1; + uint32_t reserved2:6; +#ifdef __cplusplus +public: + npu_set_kernel_stride_t(uint32_t _stride_x_lsb, uint32_t _stride_y_lsb, NPU_NAMESPACE::weight_order _weight_order, NPU_NAMESPACE::kernel_dilation _dilation_x, NPU_NAMESPACE::kernel_dilation _dilation_y, NPU_NAMESPACE::kernel_decomposition _decomposition, uint32_t _stride_x_msb, uint32_t _stride_y_msb) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + stride_x_lsb(_stride_x_lsb & ((1U << 1)-1)), + stride_y_lsb(_stride_y_lsb & ((1U << 1)-1)), + weight_order(static_cast(_weight_order) & ((1U << 1)-1)), + dilation_x(static_cast(_dilation_x) & ((1U << 1)-1)), + dilation_y(static_cast(_dilation_y) & ((1U << 1)-1)), + decomposition(static_cast(_decomposition) & ((1U << 1)-1)), + stride_x_msb(_stride_x_msb & ((1U << 1)-1)), + reserved1(0), + stride_y_msb(_stride_y_msb & ((1U << 1)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_kernel_stride_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + stride_x_lsb(0), + stride_y_lsb(0), + weight_order(0), + dilation_x(0), + dilation_y(0), + decomposition(0), + stride_x_msb(0), + reserved1(0), + stride_y_msb(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = 
static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(stride_x_lsb) << 16; + word |= uint32_t(stride_y_lsb) << 17; + word |= uint32_t(weight_order) << 18; + word |= uint32_t(dilation_x) << 19; + word |= uint32_t(dilation_y) << 20; + word |= uint32_t(decomposition) << 21; + word |= uint32_t(stride_x_msb) << 22; + word |= uint32_t(stride_y_msb) << 25; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_kernel_stride_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_kernel_stride_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_stride_x_lsb() const + { + return static_cast(stride_x_lsb); + } + CONSTEXPR npu_set_kernel_stride_t& set_stride_x_lsb(uint32_t value) + { + assert((value >> 1) == 0); + stride_x_lsb = static_cast(value & ((1U << 1)-1)); + return *this; + } + CONSTEXPR uint32_t get_stride_y_lsb() const + { + return static_cast(stride_y_lsb); + } + CONSTEXPR npu_set_kernel_stride_t& set_stride_y_lsb(uint32_t value) + { + assert((value >> 1) == 0); + stride_y_lsb = static_cast(value & ((1U << 1)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::weight_order get_weight_order() const + { + return static_cast(weight_order); + } + CONSTEXPR npu_set_kernel_stride_t& set_weight_order(NPU_NAMESPACE::weight_order value) + { + weight_order = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::kernel_dilation get_dilation_x() const + { + return static_cast(dilation_x); + } + 
CONSTEXPR npu_set_kernel_stride_t& set_dilation_x(NPU_NAMESPACE::kernel_dilation value) + { + dilation_x = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::kernel_dilation get_dilation_y() const + { + return static_cast(dilation_y); + } + CONSTEXPR npu_set_kernel_stride_t& set_dilation_y(NPU_NAMESPACE::kernel_dilation value) + { + dilation_y = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::kernel_decomposition get_decomposition() const + { + return static_cast(decomposition); + } + CONSTEXPR npu_set_kernel_stride_t& set_decomposition(NPU_NAMESPACE::kernel_decomposition value) + { + decomposition = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR uint32_t get_stride_x_msb() const + { + return static_cast(stride_x_msb); + } + CONSTEXPR npu_set_kernel_stride_t& set_stride_x_msb(uint32_t value) + { + assert((value >> 1) == 0); + stride_x_msb = static_cast(value & ((1U << 1)-1)); + return *this; + } + CONSTEXPR uint32_t get_stride_y_msb() const + { + return static_cast(stride_y_msb); + } + CONSTEXPR npu_set_kernel_stride_t& set_stride_y_msb(uint32_t value) + { + assert((value >> 1) == 0); + stride_y_msb = static_cast(value & ((1U << 1)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("stride_x_lsb", std::to_string(stride_x_lsb))); + fields.push_back(std::make_pair("stride_y_lsb", std::to_string(stride_y_lsb))); + fields.push_back(std::make_pair("weight_order", (weight_order < (sizeof(weight_order_str)/sizeof(weight_order_str[0])) ? weight_order_str[weight_order] : "****"))); + fields.push_back(std::make_pair("dilation_x", (dilation_x < (sizeof(kernel_dilation_str)/sizeof(kernel_dilation_str[0])) ? kernel_dilation_str[dilation_x] : "****"))); + fields.push_back(std::make_pair("dilation_y", (dilation_y < (sizeof(kernel_dilation_str)/sizeof(kernel_dilation_str[0])) ? 
kernel_dilation_str[dilation_y] : "****"))); + fields.push_back(std::make_pair("decomposition", (decomposition < (sizeof(kernel_decomposition_str)/sizeof(kernel_decomposition_str[0])) ? kernel_decomposition_str[decomposition] : "****"))); + fields.push_back(std::make_pair("stride_x_msb", std::to_string(stride_x_msb))); + fields.push_back(std::make_pair("stride_y_msb", std::to_string(stride_y_msb))); + } +#endif +#endif +}; + +struct npu_set_parallel_mode_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t parallel_mode:1; + uint32_t reserved1:15; +#ifdef __cplusplus +public: + npu_set_parallel_mode_t(NPU_NAMESPACE::parallel_mode _parallel_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_PARALLEL_MODE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + parallel_mode(static_cast(_parallel_mode) & ((1U << 1)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_parallel_mode_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_PARALLEL_MODE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + parallel_mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_PARALLEL_MODE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_PARALLEL_MODE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(parallel_mode) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_parallel_mode_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl 
get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_parallel_mode_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::parallel_mode get_parallel_mode() const + { + return static_cast(parallel_mode); + } + CONSTEXPR npu_set_parallel_mode_t& set_parallel_mode(NPU_NAMESPACE::parallel_mode value) + { + parallel_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("parallel_mode", (parallel_mode < (sizeof(parallel_mode_str)/sizeof(parallel_mode_str[0])) ? parallel_mode_str[parallel_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_acc_format_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t acc_format:2; + uint32_t reserved1:14; +#ifdef __cplusplus +public: + npu_set_acc_format_t(NPU_NAMESPACE::acc_format _acc_format) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + acc_format(static_cast(_acc_format) & ((1U << 2)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_acc_format_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + acc_format(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(acc_format) << 16; + return 
word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_acc_format_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_acc_format_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::acc_format get_acc_format() const + { + return static_cast(acc_format); + } + CONSTEXPR npu_set_acc_format_t& set_acc_format(NPU_NAMESPACE::acc_format value) + { + acc_format = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("acc_format", (acc_format < (sizeof(acc_format_str)/sizeof(acc_format_str[0])) ? acc_format_str[acc_format] : "****"))); + } +#endif +#endif +}; + +struct npu_set_activation_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t activation_function:5; + uint32_t reserved1:7; + uint32_t activation_clip_range:3; + uint32_t reserved2:1; +#ifdef __cplusplus +public: + npu_set_activation_t(NPU_NAMESPACE::activation_function _activation_function, NPU_NAMESPACE::activation_clip_range _activation_clip_range) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_function(static_cast(_activation_function) & ((1U << 5)-1)), + reserved1(0), + activation_clip_range(static_cast(_activation_clip_range) & ((1U << 3)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_activation_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_function(0), + 
reserved1(0), + activation_clip_range(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(activation_function) << 16; + word |= uint32_t(activation_clip_range) << 28; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_activation_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_activation_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_function get_activation_function() const + { + return static_cast(activation_function); + } + CONSTEXPR npu_set_activation_t& set_activation_function(NPU_NAMESPACE::activation_function value) + { + activation_function = static_cast(value) & ((1U << 5)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_clip_range get_activation_clip_range() const + { + return static_cast(activation_clip_range); + } + CONSTEXPR npu_set_activation_t& set_activation_clip_range(NPU_NAMESPACE::activation_clip_range value) + { + activation_clip_range = static_cast(value) & ((1U << 3)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("activation_function", (activation_function < (sizeof(activation_function_str)/sizeof(activation_function_str[0])) ? 
activation_function_str[activation_function] : "****"))); + fields.push_back(std::make_pair("activation_clip_range", (activation_clip_range < (sizeof(activation_clip_range_str)/sizeof(activation_clip_range_str[0])) ? activation_clip_range_str[activation_clip_range] : "****"))); + } +#endif +#endif +}; + +struct npu_set_activation_min_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t clip_boundary:16; +#ifdef __cplusplus +public: + npu_set_activation_min_t(uint32_t _clip_boundary) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + clip_boundary(_clip_boundary & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_activation_min_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + clip_boundary(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(clip_boundary) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_activation_min_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_activation_min_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return 
*this; + } + CONSTEXPR uint32_t get_clip_boundary() const + { + return static_cast(clip_boundary); + } + CONSTEXPR npu_set_activation_min_t& set_clip_boundary(uint32_t value) + { + assert((value >> 16) == 0); + clip_boundary = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("clip_boundary", std::to_string(clip_boundary))); + } +#endif +#endif +}; + +struct npu_set_activation_max_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t clip_boundary:16; +#ifdef __cplusplus +public: + npu_set_activation_max_t(uint32_t _clip_boundary) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + clip_boundary(_clip_boundary & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_activation_max_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + clip_boundary(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(clip_boundary) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_activation_max_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return 
static_cast(control); + } + CONSTEXPR npu_set_activation_max_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_clip_boundary() const + { + return static_cast(clip_boundary); + } + CONSTEXPR npu_set_activation_max_t& set_clip_boundary(uint32_t value) + { + assert((value >> 16) == 0); + clip_boundary = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("clip_boundary", std::to_string(clip_boundary))); + } +#endif +#endif +}; + +struct npu_set_weight_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_weight_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_weight_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_weight_region_t& 
set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_weight_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_weight_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_scale_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_scale_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_scale_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR 
NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_scale_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_scale_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_scale_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_ab_start_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t ab_start:6; + uint32_t reserved1:10; +#ifdef __cplusplus +public: + npu_set_ab_start_t(uint32_t _ab_start) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_AB_START)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + ab_start(_ab_start & ((1U << 6)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ab_start_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_AB_START)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + ab_start(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_AB_START) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_AB_START); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= 
uint32_t(control) << 14; + word |= uint32_t(ab_start) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ab_start_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ab_start_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_ab_start() const + { + return static_cast(ab_start); + } + CONSTEXPR npu_set_ab_start_t& set_ab_start(uint32_t value) + { + assert((value >> 6) == 0); + ab_start = static_cast(value & ((1U << 6)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("ab_start", std::to_string(ab_start))); + } +#endif +#endif +}; + +struct npu_set_blockdep_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t blockdep:2; + uint32_t reserved1:14; +#ifdef __cplusplus +public: + npu_set_blockdep_t(uint32_t _blockdep) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + blockdep(_blockdep & ((1U << 2)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_blockdep_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + blockdep(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + 
} + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(blockdep) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_blockdep_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_blockdep_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_blockdep() const + { + return static_cast(blockdep); + } + CONSTEXPR npu_set_blockdep_t& set_blockdep(uint32_t value) + { + assert((value >> 2) == 0); + blockdep = static_cast(value & ((1U << 2)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("blockdep", std::to_string(blockdep))); + } +#endif +#endif +}; + +struct npu_set_dma0_src_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:5; + uint32_t region_mode:1; + uint32_t stride_mode:2; + uint32_t reserved2:5; +#ifdef __cplusplus +public: + npu_set_dma0_src_region_t(uint32_t _region, NPU_NAMESPACE::dma_region_mode _region_mode, NPU_NAMESPACE::dma_stride_mode _stride_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0), + region_mode(static_cast(_region_mode) & ((1U << 1)-1)), + stride_mode(static_cast(_stride_mode) & ((1U << 2)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_dma0_src_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0), + region_mode(0), + stride_mode(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + word |= uint32_t(region_mode) << 24; + word |= uint32_t(stride_mode) << 25; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_src_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_src_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_dma0_src_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_region_mode get_region_mode() const + { + return static_cast(region_mode); + } + CONSTEXPR npu_set_dma0_src_region_t& set_region_mode(NPU_NAMESPACE::dma_region_mode value) + { + region_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_stride_mode get_stride_mode() const + { + return static_cast(stride_mode); + } + CONSTEXPR npu_set_dma0_src_region_t& set_stride_mode(NPU_NAMESPACE::dma_stride_mode value) + { + stride_mode = 
static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + fields.push_back(std::make_pair("region_mode", (region_mode < (sizeof(dma_region_mode_str)/sizeof(dma_region_mode_str[0])) ? dma_region_mode_str[region_mode] : "****"))); + fields.push_back(std::make_pair("stride_mode", (stride_mode < (sizeof(dma_stride_mode_str)/sizeof(dma_stride_mode_str[0])) ? dma_stride_mode_str[stride_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_dma0_dst_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:5; + uint32_t region_mode:1; + uint32_t stride_mode:2; + uint32_t reserved2:5; +#ifdef __cplusplus +public: + npu_set_dma0_dst_region_t(uint32_t _region, NPU_NAMESPACE::dma_region_mode _region_mode, NPU_NAMESPACE::dma_stride_mode _stride_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0), + region_mode(static_cast(_region_mode) & ((1U << 1)-1)), + stride_mode(static_cast(_stride_mode) & ((1U << 2)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_dma0_dst_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0), + region_mode(0), + stride_mode(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + 
{ + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + word |= uint32_t(region_mode) << 24; + word |= uint32_t(stride_mode) << 25; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_region_mode get_region_mode() const + { + return static_cast(region_mode); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_region_mode(NPU_NAMESPACE::dma_region_mode value) + { + region_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_stride_mode get_stride_mode() const + { + return static_cast(stride_mode); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_stride_mode(NPU_NAMESPACE::dma_stride_mode value) + { + stride_mode = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + fields.push_back(std::make_pair("region_mode", (region_mode < (sizeof(dma_region_mode_str)/sizeof(dma_region_mode_str[0])) ? dma_region_mode_str[region_mode] : "****"))); + fields.push_back(std::make_pair("stride_mode", (stride_mode < (sizeof(dma_stride_mode_str)/sizeof(dma_stride_mode_str[0])) ? 
dma_stride_mode_str[stride_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_dma0_size0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t size:16; +#ifdef __cplusplus +public: + npu_set_dma0_size0_t(uint32_t _size) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + size(_size & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_dma0_size0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + size(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(size) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_size0_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_size0_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_size() const + { + return static_cast(size); + } + CONSTEXPR npu_set_dma0_size0_t& set_size(uint32_t value) + { + assert((value >> 16) == 0); + size = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + 
fields.push_back(std::make_pair("size", std::to_string(size))); + } +#endif +#endif +}; + +struct npu_set_dma0_size1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t size:16; +#ifdef __cplusplus +public: + npu_set_dma0_size1_t(uint32_t _size) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + size(_size & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_dma0_size1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + size(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(size) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_size1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_size1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_size() const + { + return static_cast(size); + } + CONSTEXPR npu_set_dma0_size1_t& set_size(uint32_t value) + { + assert((value >> 16) == 0); + size = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) 
const + { + fields.push_back(std::make_pair("size", std::to_string(size))); + } +#endif +#endif +}; + +struct npu_set_ifm2_broadcast_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t broadcast_h:1; + uint32_t broadcast_w:1; + uint32_t broadcast_c:1; + uint32_t reserved1:3; + uint32_t operand_order:1; + uint32_t broadcast_constant:1; + uint32_t reserved2:8; +#ifdef __cplusplus +public: + npu_set_ifm2_broadcast_t(NPU_NAMESPACE::broadcast_mode _broadcast_h, NPU_NAMESPACE::broadcast_mode _broadcast_w, NPU_NAMESPACE::broadcast_mode _broadcast_c, NPU_NAMESPACE::ifm2_operand_order _operand_order, NPU_NAMESPACE::broadcast_mode _broadcast_constant) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + broadcast_h(static_cast(_broadcast_h) & ((1U << 1)-1)), + broadcast_w(static_cast(_broadcast_w) & ((1U << 1)-1)), + broadcast_c(static_cast(_broadcast_c) & ((1U << 1)-1)), + reserved1(0), + operand_order(static_cast(_operand_order) & ((1U << 1)-1)), + broadcast_constant(static_cast(_broadcast_constant) & ((1U << 1)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_ifm2_broadcast_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + broadcast_h(0), + broadcast_w(0), + broadcast_c(0), + reserved1(0), + operand_order(0), + broadcast_constant(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= 
uint32_t(control) << 14; + word |= uint32_t(broadcast_h) << 16; + word |= uint32_t(broadcast_w) << 17; + word |= uint32_t(broadcast_c) << 18; + word |= uint32_t(operand_order) << 22; + word |= uint32_t(broadcast_constant) << 23; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_broadcast_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_broadcast_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::broadcast_mode get_broadcast_h() const + { + return static_cast(broadcast_h); + } + CONSTEXPR npu_set_ifm2_broadcast_t& set_broadcast_h(NPU_NAMESPACE::broadcast_mode value) + { + broadcast_h = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::broadcast_mode get_broadcast_w() const + { + return static_cast(broadcast_w); + } + CONSTEXPR npu_set_ifm2_broadcast_t& set_broadcast_w(NPU_NAMESPACE::broadcast_mode value) + { + broadcast_w = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::broadcast_mode get_broadcast_c() const + { + return static_cast(broadcast_c); + } + CONSTEXPR npu_set_ifm2_broadcast_t& set_broadcast_c(NPU_NAMESPACE::broadcast_mode value) + { + broadcast_c = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::ifm2_operand_order get_operand_order() const + { + return static_cast(operand_order); + } + CONSTEXPR npu_set_ifm2_broadcast_t& set_operand_order(NPU_NAMESPACE::ifm2_operand_order value) + { + operand_order = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::broadcast_mode get_broadcast_constant() const + { + return static_cast(broadcast_constant); + } + 
CONSTEXPR npu_set_ifm2_broadcast_t& set_broadcast_constant(NPU_NAMESPACE::broadcast_mode value) + { + broadcast_constant = static_cast(value) & ((1U << 1)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("broadcast_h", (broadcast_h < (sizeof(broadcast_mode_str)/sizeof(broadcast_mode_str[0])) ? broadcast_mode_str[broadcast_h] : "****"))); + fields.push_back(std::make_pair("broadcast_w", (broadcast_w < (sizeof(broadcast_mode_str)/sizeof(broadcast_mode_str[0])) ? broadcast_mode_str[broadcast_w] : "****"))); + fields.push_back(std::make_pair("broadcast_c", (broadcast_c < (sizeof(broadcast_mode_str)/sizeof(broadcast_mode_str[0])) ? broadcast_mode_str[broadcast_c] : "****"))); + fields.push_back(std::make_pair("operand_order", (operand_order < (sizeof(ifm2_operand_order_str)/sizeof(ifm2_operand_order_str[0])) ? ifm2_operand_order_str[operand_order] : "****"))); + fields.push_back(std::make_pair("broadcast_constant", (broadcast_constant < (sizeof(broadcast_mode_str)/sizeof(broadcast_mode_str[0])) ? 
broadcast_mode_str[broadcast_constant] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ifm2_scalar_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t scalar:16; +#ifdef __cplusplus +public: + npu_set_ifm2_scalar_t(uint32_t _scalar) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_SCALAR)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + scalar(_scalar & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm2_scalar_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_SCALAR)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + scalar(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_SCALAR) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_SCALAR); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(scalar) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_scalar_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_scalar_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_scalar() const + { + return static_cast(scalar); + } + CONSTEXPR npu_set_ifm2_scalar_t& set_scalar(uint32_t value) + { + assert((value >> 16) == 0); + scalar = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void 
disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("scalar", std::to_string(scalar))); + } +#endif +#endif +}; + +struct npu_set_ifm2_precision_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t activation_type:1; + uint32_t reserved1:1; + uint32_t activation_precision:2; + uint32_t reserved2:2; + uint32_t activation_format:2; + uint32_t reserved3:8; +#ifdef __cplusplus +public: + npu_set_ifm2_precision_t(NPU_NAMESPACE::activation_type _activation_type, NPU_NAMESPACE::activation_precision _activation_precision, NPU_NAMESPACE::activation_format _activation_format) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(static_cast(_activation_type) & ((1U << 1)-1)), + reserved1(0), + activation_precision(static_cast(_activation_precision) & ((1U << 2)-1)), + reserved2(0), + activation_format(static_cast(_activation_format) & ((1U << 2)-1)), + reserved3(0) + {} + CONSTEXPR npu_set_ifm2_precision_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(0), + reserved1(0), + activation_precision(0), + reserved2(0), + activation_format(0), + reserved3(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(activation_type) << 16; + word |= uint32_t(activation_precision) << 18; + word |= uint32_t(activation_format) 
<< 22; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_precision_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_precision_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_type get_activation_type() const + { + return static_cast(activation_type); + } + CONSTEXPR npu_set_ifm2_precision_t& set_activation_type(NPU_NAMESPACE::activation_type value) + { + activation_type = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_precision get_activation_precision() const + { + return static_cast(activation_precision); + } + CONSTEXPR npu_set_ifm2_precision_t& set_activation_precision(NPU_NAMESPACE::activation_precision value) + { + activation_precision = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_format get_activation_format() const + { + return static_cast(activation_format); + } + CONSTEXPR npu_set_ifm2_precision_t& set_activation_format(NPU_NAMESPACE::activation_format value) + { + activation_format = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("activation_type", (activation_type < (sizeof(activation_type_str)/sizeof(activation_type_str[0])) ? activation_type_str[activation_type] : "****"))); + fields.push_back(std::make_pair("activation_precision", (activation_precision < (sizeof(activation_precision_str)/sizeof(activation_precision_str[0])) ? 
activation_precision_str[activation_precision] : "****"))); + fields.push_back(std::make_pair("activation_format", (activation_format < (sizeof(activation_format_str)/sizeof(activation_format_str[0])) ? activation_format_str[activation_format] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ifm2_zero_point_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t zero_point:16; +#ifdef __cplusplus +public: + npu_set_ifm2_zero_point_t(uint32_t _zero_point) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(_zero_point & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm2_zero_point_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(zero_point) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_zero_point_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_zero_point_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t 
get_zero_point() const + { + return static_cast(zero_point); + } + CONSTEXPR npu_set_ifm2_zero_point_t& set_zero_point(uint32_t value) + { + assert((value >> 16) == 0); + zero_point = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("zero_point", std::to_string(zero_point))); + } +#endif +#endif +}; + +struct npu_set_ifm2_width0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm2_width0_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm2_width0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_width0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_width0_m1_t& 
set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_ifm2_width0_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm2_height0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm2_height0_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm2_height0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_height0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR 
NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_height0_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ifm2_height0_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm2_height1_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm2_height1_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm2_height1_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_height1_m1_t& 
set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_height1_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ifm2_height1_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm2_ib_start_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t ib_start:6; + uint32_t reserved1:10; +#ifdef __cplusplus +public: + npu_set_ifm2_ib_start_t(uint32_t _ib_start) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_IB_START)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + ib_start(_ib_start & ((1U << 6)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm2_ib_start_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_IB_START)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + ib_start(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_IB_START) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_IB_START); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(ib_start) << 
16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_ib_start_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_ib_start_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_ib_start() const + { + return static_cast(ib_start); + } + CONSTEXPR npu_set_ifm2_ib_start_t& set_ib_start(uint32_t value) + { + assert((value >> 6) == 0); + ib_start = static_cast(value & ((1U << 6)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("ib_start", std::to_string(ib_start))); + } +#endif +#endif +}; + +struct npu_set_ifm2_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_ifm2_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm2_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + 
uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_ifm2_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_ifm_base0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_base0_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_base0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0) && control >= 1 
&& control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_base0_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_base1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_base1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_base1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = 
static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_base1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_base2_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_base2_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_base2_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2); control = 
static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_base2_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_base3_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_base3_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_base3_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; 
+ word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_base3_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_stride_x_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_stride_x_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_stride_x_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= 
uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_stride_x_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_stride_y_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_stride_y_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_stride_y_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; 
+ } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_stride_y_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_stride_c_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_stride_c_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_stride_c_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return 
(static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_stride_c_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_base0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_base0_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_base0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_base0_t& set_addr(uint64_t 
value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_base1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_base1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_base1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_base1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = 
static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_base2_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_base2_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_base2_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_base2_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef 
NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_base3_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_base3_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_base3_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_base3_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr 
<< std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_stride_x_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_stride_x_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_stride_x_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_stride_x_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + 
fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_stride_y_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_stride_y_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_stride_y_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_stride_y_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif 
+}; + +struct npu_set_ofm_stride_c_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_stride_c_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_stride_c_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_stride_c_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight_base_t +{ +#ifdef __cplusplus +private: 
+#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_weight_base_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_weight_base_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_weight_base_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight_length_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + 
uint32_t reserved1:16; + uint32_t length:32; +#ifdef __cplusplus +public: + npu_set_weight_length_t(uint32_t _length) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(_length) + {} + CONSTEXPR npu_set_weight_length_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(length) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_weight_length_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_weight_length_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_length() const + { + return static_cast(length); + } + CONSTEXPR npu_set_weight_length_t& set_length(uint32_t value) + { + length = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("length", std::to_string(length))); + } +#endif +#endif +}; + +struct npu_set_scale_base_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t 
control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_scale_base_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_scale_base_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_scale_base_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_scale_length_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t length:20; + uint32_t reserved2:12; 
+#ifdef __cplusplus +public: + npu_set_scale_length_t(uint32_t _length) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(_length & ((1U << 20)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_scale_length_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(length) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_scale_length_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_scale_length_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_length() const + { + return static_cast(length); + } + CONSTEXPR npu_set_scale_length_t& set_length(uint32_t value) + { + assert((value >> 20) == 0); + length = value & ((1U << 20)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("length", std::to_string(length))); + } +#endif +#endif +}; + +struct npu_set_ofm_scale_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t 
opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t shift:6; + uint32_t reserved1:10; + uint32_t scale:32; +#ifdef __cplusplus +public: + npu_set_ofm_scale_t(uint32_t _shift, uint32_t _scale) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(_shift & ((1U << 6)-1)), + reserved1(0), + scale(_scale) + {} + CONSTEXPR npu_set_ofm_scale_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(0), + reserved1(0), + scale(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(shift) << 16; + word |= uint64_t(scale) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_scale_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_scale_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_shift() const + { + return static_cast(shift); + } + CONSTEXPR npu_set_ofm_scale_t& set_shift(uint32_t value) + { + assert((value >> 6) == 0); + shift = static_cast(value & ((1U << 6)-1)); + return *this; + } + CONSTEXPR uint32_t get_scale() const + { + return static_cast(scale); + } + CONSTEXPR npu_set_ofm_scale_t& 
set_scale(uint32_t value) + { + scale = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("shift", std::to_string(shift))); + fields.push_back(std::make_pair("scale", std::to_string(scale))); + } +#endif +#endif +}; + +struct npu_set_opa_scale_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t shift:6; + uint32_t reserved1:10; + uint32_t scale:32; +#ifdef __cplusplus +public: + npu_set_opa_scale_t(uint32_t _shift, uint32_t _scale) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPA_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(_shift & ((1U << 6)-1)), + reserved1(0), + scale(_scale) + {} + CONSTEXPR npu_set_opa_scale_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPA_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(0), + reserved1(0), + scale(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPA_SCALE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPA_SCALE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(shift) << 16; + word |= uint64_t(scale) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_opa_scale_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_opa_scale_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) 
& ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_shift() const + { + return static_cast(shift); + } + CONSTEXPR npu_set_opa_scale_t& set_shift(uint32_t value) + { + assert((value >> 6) == 0); + shift = static_cast(value & ((1U << 6)-1)); + return *this; + } + CONSTEXPR uint32_t get_scale() const + { + return static_cast(scale); + } + CONSTEXPR npu_set_opa_scale_t& set_scale(uint32_t value) + { + scale = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("shift", std::to_string(shift))); + fields.push_back(std::make_pair("scale", std::to_string(scale))); + } +#endif +#endif +}; + +struct npu_set_opb_scale_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t scale:16; + uint32_t reserved2:16; +#ifdef __cplusplus +public: + npu_set_opb_scale_t(uint32_t _scale) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPB_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + scale(_scale & ((1U << 16)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_opb_scale_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPB_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + scale(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPB_SCALE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OPB_SCALE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(scale) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR 
npu_set_opb_scale_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_opb_scale_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_scale() const + { + return static_cast(scale); + } + CONSTEXPR npu_set_opb_scale_t& set_scale(uint32_t value) + { + assert((value >> 16) == 0); + scale = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("scale", std::to_string(scale))); + } +#endif +#endif +}; + +struct npu_set_dma0_src_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_src_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_src_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + 
word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_src_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_dst_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_dst_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_dst_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + 
return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_dst_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_len_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_len_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_len_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; 
+ } + CONSTEXPR npu_set_dma0_len_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_skip0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_skip0_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_skip0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP0) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_skip0_t& set_addr(uint64_t value) + { + addr_lo = 
static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_skip1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_skip1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_skip1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SKIP1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_skip1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = 
static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_base0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_base0_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_base0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_base0_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } 
+#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_base1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_base1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_base1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_base1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + 
std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_base2_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_base2_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_base2_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_base2_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + 
fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_base3_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_base3_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_base3_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_base3_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct 
npu_set_ifm2_stride_x_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_stride_x_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_stride_x_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_stride_x_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_stride_y_t +{ +#ifdef __cplusplus +private: +#endif + 
uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_stride_y_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_stride_y_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_stride_y_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_stride_c_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + 
uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_stride_c_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_stride_c_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_stride_c_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight1_base_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; 
+#ifdef __cplusplus +public: + npu_set_weight1_base_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_weight1_base_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_weight1_base_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight1_length_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t length:32; +#ifdef __cplusplus +public: + npu_set_weight1_length_t(uint32_t _length) : + 
opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(_length) + {} + CONSTEXPR npu_set_weight1_length_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(length) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_weight1_length_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_weight1_length_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_length() const + { + return static_cast(length); + } + CONSTEXPR npu_set_weight1_length_t& set_length(uint32_t value) + { + length = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("length", std::to_string(length))); + } +#endif +#endif +}; + +struct npu_set_scale1_base_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + 
npu_set_scale1_base_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_scale1_base_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_BASE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_BASE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_scale1_base_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_scale1_length_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t length:20; + uint32_t reserved2:12; +#ifdef __cplusplus +public: + npu_set_scale1_length_t(uint32_t _length) : + 
opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(_length & ((1U << 20)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_scale1_length_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_LENGTH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE1_LENGTH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(length) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_scale1_length_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_scale1_length_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_length() const + { + return static_cast(length); + } + CONSTEXPR npu_set_scale1_length_t& set_length(uint32_t value) + { + assert((value >> 20) == 0); + length = value & ((1U << 20)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("length", std::to_string(length))); + } +#endif +#endif +}; +#ifdef __cplusplus +}; +#endif +#define NPU_OP_STRUCTS \ + NPU_OP_(stop) \ + NPU_OP_(irq) \ + NPU_OP_(conv) \ + NPU_OP_(depthwise) \ + NPU_OP_(pool) \ + 
NPU_OP_(elementwise) \ + NPU_OP_(dma_start) \ + NPU_OP_(dma_wait) \ + NPU_OP_(kernel_wait) \ + NPU_OP_(pmu_mask) + +#define NPU_SET_STRUCTS \ + NPU_SET_(ifm_pad_top) \ + NPU_SET_(ifm_pad_left) \ + NPU_SET_(ifm_pad_right) \ + NPU_SET_(ifm_pad_bottom) \ + NPU_SET_(ifm_depth_m1) \ + NPU_SET_(ifm_precision) \ + NPU_SET_(ifm_upscale) \ + NPU_SET_(ifm_zero_point) \ + NPU_SET_(ifm_width0_m1) \ + NPU_SET_(ifm_height0_m1) \ + NPU_SET_(ifm_height1_m1) \ + NPU_SET_(ifm_ib_end) \ + NPU_SET_(ifm_region) \ + NPU_SET_(ofm_width_m1) \ + NPU_SET_(ofm_height_m1) \ + NPU_SET_(ofm_depth_m1) \ + NPU_SET_(ofm_precision) \ + NPU_SET_(ofm_blk_width_m1) \ + NPU_SET_(ofm_blk_height_m1) \ + NPU_SET_(ofm_blk_depth_m1) \ + NPU_SET_(ofm_zero_point) \ + NPU_SET_(ofm_width0_m1) \ + NPU_SET_(ofm_height0_m1) \ + NPU_SET_(ofm_height1_m1) \ + NPU_SET_(ofm_region) \ + NPU_SET_(kernel_width_m1) \ + NPU_SET_(kernel_height_m1) \ + NPU_SET_(kernel_stride) \ + NPU_SET_(parallel_mode) \ + NPU_SET_(acc_format) \ + NPU_SET_(activation) \ + NPU_SET_(activation_min) \ + NPU_SET_(activation_max) \ + NPU_SET_(weight_region) \ + NPU_SET_(scale_region) \ + NPU_SET_(ab_start) \ + NPU_SET_(blockdep) \ + NPU_SET_(dma0_src_region) \ + NPU_SET_(dma0_dst_region) \ + NPU_SET_(dma0_size0) \ + NPU_SET_(dma0_size1) \ + NPU_SET_(ifm2_broadcast) \ + NPU_SET_(ifm2_scalar) \ + NPU_SET_(ifm2_precision) \ + NPU_SET_(ifm2_zero_point) \ + NPU_SET_(ifm2_width0_m1) \ + NPU_SET_(ifm2_height0_m1) \ + NPU_SET_(ifm2_height1_m1) \ + NPU_SET_(ifm2_ib_start) \ + NPU_SET_(ifm2_region) \ + NPU_SET_(ifm_base0) \ + NPU_SET_(ifm_base1) \ + NPU_SET_(ifm_base2) \ + NPU_SET_(ifm_base3) \ + NPU_SET_(ifm_stride_x) \ + NPU_SET_(ifm_stride_y) \ + NPU_SET_(ifm_stride_c) \ + NPU_SET_(ofm_base0) \ + NPU_SET_(ofm_base1) \ + NPU_SET_(ofm_base2) \ + NPU_SET_(ofm_base3) \ + NPU_SET_(ofm_stride_x) \ + NPU_SET_(ofm_stride_y) \ + NPU_SET_(ofm_stride_c) \ + NPU_SET_(weight_base) \ + NPU_SET_(weight_length) \ + NPU_SET_(scale_base) \ + NPU_SET_(scale_length) \ + 
NPU_SET_(ofm_scale) \ + NPU_SET_(opa_scale) \ + NPU_SET_(opb_scale) \ + NPU_SET_(dma0_src) \ + NPU_SET_(dma0_dst) \ + NPU_SET_(dma0_len) \ + NPU_SET_(dma0_skip0) \ + NPU_SET_(dma0_skip1) \ + NPU_SET_(ifm2_base0) \ + NPU_SET_(ifm2_base1) \ + NPU_SET_(ifm2_base2) \ + NPU_SET_(ifm2_base3) \ + NPU_SET_(ifm2_stride_x) \ + NPU_SET_(ifm2_stride_y) \ + NPU_SET_(ifm2_stride_c) \ + NPU_SET_(weight1_base) \ + NPU_SET_(weight1_length) \ + NPU_SET_(scale1_base) \ + NPU_SET_(scale1_length) + +#define EXPAND_ACC_FORMAT(FUNC, SEP) \ + FUNC(acc_format, I32) SEP \ + FUNC(acc_format, I40) SEP \ + FUNC(acc_format, F16) + +#define EXPAND_ACTIVATION_CLIP_RANGE(FUNC, SEP) \ + FUNC(activation_clip_range, OFM_PRECISION) SEP \ + FUNC(activation_clip_range, FORCE_UINT8) SEP \ + FUNC(activation_clip_range, FORCE_INT8) SEP \ + FUNC(activation_clip_range, FORCE_INT16) + +#define EXPAND_ACTIVATION_FORMAT(FUNC, SEP) \ + FUNC(activation_format, NHWC) SEP \ + FUNC(activation_format, NHCWB16) + +#define EXPAND_ACTIVATION_FUNCTION(FUNC, SEP) \ + FUNC(activation_function, RELU) SEP \ + FUNC(activation_function, TANH) SEP \ + FUNC(activation_function, SIGMOID) SEP \ + FUNC(activation_function, TABLE_0) SEP \ + FUNC(activation_function, TABLE_1) SEP \ + FUNC(activation_function, TABLE_2) SEP \ + FUNC(activation_function, TABLE_3) SEP \ + FUNC(activation_function, TABLE_4) SEP \ + FUNC(activation_function, TABLE_5) SEP \ + FUNC(activation_function, TABLE_6) SEP \ + FUNC(activation_function, TABLE_7) + +#define EXPAND_ACTIVATION_PRECISION(FUNC, SEP) \ + FUNC(activation_precision, B8) SEP \ + FUNC(activation_precision, B16) SEP \ + FUNC(activation_precision, B32) SEP \ + FUNC(activation_precision, B64) + +#define EXPAND_ACTIVATION_TYPE(FUNC, SEP) \ + FUNC(activation_type, UNSIGNED) SEP \ + FUNC(activation_type, SIGNED) + +#define EXPAND_AXI_MEM_ENCODING(FUNC, SEP) \ + FUNC(axi_mem_encoding, DEVICE_NON_BUFFERABLE) SEP \ + FUNC(axi_mem_encoding, DEVICE_BUFFERABLE) SEP \ + FUNC(axi_mem_encoding, 
NORMAL_NON_CACHEABLE_NON_BUFFERABLE) SEP \ + FUNC(axi_mem_encoding, NORMAL_NON_CACHEABLE_BUFFERABLE) SEP \ + FUNC(axi_mem_encoding, WRITE_THROUGH_NO_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_THROUGH_READ_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_THROUGH_WRITE_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_THROUGH_READ_AND_WRITE_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_BACK_NO_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_BACK_READ_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_BACK_WRITE_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_BACK_READ_AND_WRITE_ALLOCATE) + +#define EXPAND_BROADCAST_MODE(FUNC, SEP) \ + FUNC(broadcast_mode, DISABLE) SEP \ + FUNC(broadcast_mode, ENABLE) + +#define EXPAND_CMD0_OPCODE(FUNC, SEP) \ + FUNC(cmd0_opcode, NPU_OP_STOP) SEP \ + FUNC(cmd0_opcode, NPU_OP_IRQ) SEP \ + FUNC(cmd0_opcode, NPU_OP_CONV) SEP \ + FUNC(cmd0_opcode, NPU_OP_DEPTHWISE) SEP \ + FUNC(cmd0_opcode, NPU_OP_POOL) SEP \ + FUNC(cmd0_opcode, NPU_OP_ELEMENTWISE) SEP \ + FUNC(cmd0_opcode, NPU_OP_DMA_START) SEP \ + FUNC(cmd0_opcode, NPU_OP_DMA_WAIT) SEP \ + FUNC(cmd0_opcode, NPU_OP_KERNEL_WAIT) SEP \ + FUNC(cmd0_opcode, NPU_OP_PMU_MASK) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PAD_TOP) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PAD_LEFT) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PAD_RIGHT) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PAD_BOTTOM) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_DEPTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PRECISION) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_UPSCALE) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_ZERO_POINT) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_WIDTH0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_HEIGHT0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_HEIGHT1_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_IB_END) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_WIDTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_HEIGHT_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_DEPTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_PRECISION) SEP \ + 
FUNC(cmd0_opcode, NPU_SET_OFM_BLK_WIDTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_BLK_HEIGHT_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_BLK_DEPTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_ZERO_POINT) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_WIDTH0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_HEIGHT0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_HEIGHT1_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_KERNEL_WIDTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_KERNEL_HEIGHT_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_KERNEL_STRIDE) SEP \ + FUNC(cmd0_opcode, NPU_SET_PARALLEL_MODE) SEP \ + FUNC(cmd0_opcode, NPU_SET_ACC_FORMAT) SEP \ + FUNC(cmd0_opcode, NPU_SET_ACTIVATION) SEP \ + FUNC(cmd0_opcode, NPU_SET_ACTIVATION_MIN) SEP \ + FUNC(cmd0_opcode, NPU_SET_ACTIVATION_MAX) SEP \ + FUNC(cmd0_opcode, NPU_SET_WEIGHT_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_SCALE_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_AB_START) SEP \ + FUNC(cmd0_opcode, NPU_SET_BLOCKDEP) SEP \ + FUNC(cmd0_opcode, NPU_SET_DMA0_SRC_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_DMA0_DST_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_DMA0_SIZE0) SEP \ + FUNC(cmd0_opcode, NPU_SET_DMA0_SIZE1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_BROADCAST) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_SCALAR) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_PRECISION) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_ZERO_POINT) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_WIDTH0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_HEIGHT0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_HEIGHT1_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_IB_START) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_REGION) + +#define EXPAND_CMD1_OPCODE(FUNC, SEP) \ + FUNC(cmd1_opcode, NPU_SET_IFM_BASE0) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_BASE1) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_BASE2) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_BASE3) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_STRIDE_X) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_STRIDE_Y) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_STRIDE_C) SEP \ + FUNC(cmd1_opcode, 
NPU_SET_OFM_BASE0) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_BASE1) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_BASE2) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_BASE3) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_STRIDE_X) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_STRIDE_Y) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_STRIDE_C) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT_BASE) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT_LENGTH) SEP \ + FUNC(cmd1_opcode, NPU_SET_SCALE_BASE) SEP \ + FUNC(cmd1_opcode, NPU_SET_SCALE_LENGTH) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_SCALE) SEP \ + FUNC(cmd1_opcode, NPU_SET_OPA_SCALE) SEP \ + FUNC(cmd1_opcode, NPU_SET_OPB_SCALE) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_SRC) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_DST) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_LEN) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_SKIP0) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_SKIP1) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_BASE0) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_BASE1) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_BASE2) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_BASE3) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_STRIDE_X) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_STRIDE_Y) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_STRIDE_C) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT1_BASE) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT1_LENGTH) SEP \ + FUNC(cmd1_opcode, NPU_SET_SCALE1_BASE) SEP \ + FUNC(cmd1_opcode, NPU_SET_SCALE1_LENGTH) + +#define EXPAND_CMD_CTRL(FUNC, SEP) \ + FUNC(cmd_ctrl, CMD0_CTRL) SEP \ + FUNC(cmd_ctrl, CMD1_CTRL) + +#define EXPAND_CUSTOM_DMA(FUNC, SEP) \ + FUNC(custom_dma, NOT_IMPLEMENTED) SEP \ + FUNC(custom_dma, IMPLEMENTED) + +#define EXPAND_DMA_FAULT_CHANNEL(FUNC, SEP) \ + FUNC(dma_fault_channel, CMD_READ) SEP \ + FUNC(dma_fault_channel, IFM_READ) SEP \ + FUNC(dma_fault_channel, WEIGHT_READ) SEP \ + FUNC(dma_fault_channel, SBS_READ) SEP \ + FUNC(dma_fault_channel, MEM2MEM_READ) SEP \ + FUNC(dma_fault_channel, OFM_WRITE) SEP \ + FUNC(dma_fault_channel, MEM2MEM_WRITE) + +#define EXPAND_DMA_FAULT_SRC(FUNC, SEP) \ + 
FUNC(dma_fault_src, AXI_M0) SEP \ + FUNC(dma_fault_src, AXI_M1) + +#define EXPAND_DMA_REGION_MODE(FUNC, SEP) \ + FUNC(dma_region_mode, EXTERNAL) SEP \ + FUNC(dma_region_mode, INTERNAL) + +#define EXPAND_DMA_STRIDE_MODE(FUNC, SEP) \ + FUNC(dma_stride_mode, D1) SEP \ + FUNC(dma_stride_mode, D2) SEP \ + FUNC(dma_stride_mode, D3) + +#define EXPAND_ELEMENTWISE_MODE(FUNC, SEP) \ + FUNC(elementwise_mode, MUL) SEP \ + FUNC(elementwise_mode, ADD) SEP \ + FUNC(elementwise_mode, SUB) SEP \ + FUNC(elementwise_mode, MIN) SEP \ + FUNC(elementwise_mode, MAX) SEP \ + FUNC(elementwise_mode, LRELU) SEP \ + FUNC(elementwise_mode, ABS) SEP \ + FUNC(elementwise_mode, CLZ) SEP \ + FUNC(elementwise_mode, SHR) SEP \ + FUNC(elementwise_mode, SHL) + +#define EXPAND_IFM2_OPERAND_ORDER(FUNC, SEP) \ + FUNC(ifm2_operand_order, ORDER_B) SEP \ + FUNC(ifm2_operand_order, ORDER_A) + +#define EXPAND_IFM_SCALE_MODE(FUNC, SEP) \ + FUNC(ifm_scale_mode, OPA_OPB_16) SEP \ + FUNC(ifm_scale_mode, OPA_32) SEP \ + FUNC(ifm_scale_mode, OPB_32) + +#define EXPAND_IFM_UPSCALE_MODE(FUNC, SEP) \ + FUNC(ifm_upscale_mode, NONE) SEP \ + FUNC(ifm_upscale_mode, NEAREST) SEP \ + FUNC(ifm_upscale_mode, ZEROS) + +#define EXPAND_KERNEL_DECOMPOSITION(FUNC, SEP) \ + FUNC(kernel_decomposition, D8X8) SEP \ + FUNC(kernel_decomposition, D4X4) + +#define EXPAND_KERNEL_DILATION(FUNC, SEP) \ + FUNC(kernel_dilation, NONE) SEP \ + FUNC(kernel_dilation, X2) + +#define EXPAND_MAX_BEATS(FUNC, SEP) \ + FUNC(max_beats, B64) SEP \ + FUNC(max_beats, B128) SEP \ + FUNC(max_beats, B256) + +#define EXPAND_MEM_ATTR(FUNC, SEP) \ + FUNC(mem_attr, AXI0_OUTSTANDING_COUNTER0) SEP \ + FUNC(mem_attr, AXI0_OUTSTANDING_COUNTER1) SEP \ + FUNC(mem_attr, AXI1_OUTSTANDING_COUNTER2) SEP \ + FUNC(mem_attr, AXI1_OUTSTANDING_COUNTER3) + +#define EXPAND_OFM_SCALE_MODE(FUNC, SEP) \ + FUNC(ofm_scale_mode, PER_CHANNEL) SEP \ + FUNC(ofm_scale_mode, GLOBAL) + +#define EXPAND_PARALLEL_MODE(FUNC, SEP) \ + FUNC(parallel_mode, SINGLE_CORE) SEP \ + FUNC(parallel_mode, 
DUAL_CORE_DEPTH) + +#define EXPAND_PMU_AXI_CHANNEL(FUNC, SEP) \ + FUNC(pmu_axi_channel, RD_CMD) SEP \ + FUNC(pmu_axi_channel, RD_IFM) SEP \ + FUNC(pmu_axi_channel, RD_WEIGHTS) SEP \ + FUNC(pmu_axi_channel, RD_SCALE_BIAS) SEP \ + FUNC(pmu_axi_channel, RD_MEM2MEM) SEP \ + FUNC(pmu_axi_channel, WR_OFM) SEP \ + FUNC(pmu_axi_channel, WR_MEM2MEM) + +#define EXPAND_PMU_EVENT(FUNC, SEP) \ + FUNC(pmu_event, NO_EVENT) SEP \ + FUNC(pmu_event, CYCLE) SEP \ + FUNC(pmu_event, NPU_IDLE) SEP \ + FUNC(pmu_event, CC_STALLED_ON_BLOCKDEP) SEP \ + FUNC(pmu_event, CC_STALLED_ON_SHRAM_RECONFIG) SEP \ + FUNC(pmu_event, NPU_ACTIVE) SEP \ + FUNC(pmu_event, MAC_ACTIVE) SEP \ + FUNC(pmu_event, MAC_ACTIVE_8BIT) SEP \ + FUNC(pmu_event, MAC_ACTIVE_16BIT) SEP \ + FUNC(pmu_event, MAC_DPU_ACTIVE) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_WD_ACC) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_WD) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_ACC) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_IB) SEP \ + FUNC(pmu_event, MAC_ACTIVE_32BIT) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_INT_W) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_INT_ACC) SEP \ + FUNC(pmu_event, AO_ACTIVE) SEP \ + FUNC(pmu_event, AO_ACTIVE_8BIT) SEP \ + FUNC(pmu_event, AO_ACTIVE_16BIT) SEP \ + FUNC(pmu_event, AO_STALLED_BY_OFMP_OB) SEP \ + FUNC(pmu_event, AO_STALLED_BY_OFMP) SEP \ + FUNC(pmu_event, AO_STALLED_BY_OB) SEP \ + FUNC(pmu_event, AO_STALLED_BY_ACC_IB) SEP \ + FUNC(pmu_event, AO_STALLED_BY_ACC) SEP \ + FUNC(pmu_event, AO_STALLED_BY_IB) SEP \ + FUNC(pmu_event, WD_ACTIVE) SEP \ + FUNC(pmu_event, WD_STALLED) SEP \ + FUNC(pmu_event, WD_STALLED_BY_WS) SEP \ + FUNC(pmu_event, WD_STALLED_BY_WD_BUF) SEP \ + FUNC(pmu_event, WD_PARSE_ACTIVE) SEP \ + FUNC(pmu_event, WD_PARSE_STALLED) SEP \ + FUNC(pmu_event, WD_PARSE_STALLED_IN) SEP \ + FUNC(pmu_event, WD_PARSE_STALLED_OUT) SEP \ + FUNC(pmu_event, WD_TRANS_WS) SEP \ + FUNC(pmu_event, WD_TRANS_WB) SEP \ + FUNC(pmu_event, WD_TRANS_DW0) SEP \ + FUNC(pmu_event, WD_TRANS_DW1) SEP \ + FUNC(pmu_event, AXI0_RD_TRANS_ACCEPTED) 
SEP \ + FUNC(pmu_event, AXI0_RD_TRANS_COMPLETED) SEP \ + FUNC(pmu_event, AXI0_RD_DATA_BEAT_RECEIVED) SEP \ + FUNC(pmu_event, AXI0_RD_TRAN_REQ_STALLED) SEP \ + FUNC(pmu_event, AXI0_WR_TRANS_ACCEPTED) SEP \ + FUNC(pmu_event, AXI0_WR_TRANS_COMPLETED_M) SEP \ + FUNC(pmu_event, AXI0_WR_TRANS_COMPLETED_S) SEP \ + FUNC(pmu_event, AXI0_WR_DATA_BEAT_WRITTEN) SEP \ + FUNC(pmu_event, AXI0_WR_TRAN_REQ_STALLED) SEP \ + FUNC(pmu_event, AXI0_WR_DATA_BEAT_STALLED) SEP \ + FUNC(pmu_event, AXI0_ENABLED_CYCLES) SEP \ + FUNC(pmu_event, AXI0_RD_STALL_LIMIT) SEP \ + FUNC(pmu_event, AXI0_WR_STALL_LIMIT) SEP \ + FUNC(pmu_event, AXI_LATENCY_ANY) SEP \ + FUNC(pmu_event, AXI_LATENCY_32) SEP \ + FUNC(pmu_event, AXI_LATENCY_64) SEP \ + FUNC(pmu_event, AXI_LATENCY_128) SEP \ + FUNC(pmu_event, AXI_LATENCY_256) SEP \ + FUNC(pmu_event, AXI_LATENCY_512) SEP \ + FUNC(pmu_event, AXI_LATENCY_1024) SEP \ + FUNC(pmu_event, ECC_DMA) SEP \ + FUNC(pmu_event, ECC_SB0) SEP \ + FUNC(pmu_event, AXI1_RD_TRANS_ACCEPTED) SEP \ + FUNC(pmu_event, AXI1_RD_TRANS_COMPLETED) SEP \ + FUNC(pmu_event, AXI1_RD_DATA_BEAT_RECEIVED) SEP \ + FUNC(pmu_event, AXI1_RD_TRAN_REQ_STALLED) SEP \ + FUNC(pmu_event, AXI1_WR_TRANS_ACCEPTED) SEP \ + FUNC(pmu_event, AXI1_WR_TRANS_COMPLETED_M) SEP \ + FUNC(pmu_event, AXI1_WR_TRANS_COMPLETED_S) SEP \ + FUNC(pmu_event, AXI1_WR_DATA_BEAT_WRITTEN) SEP \ + FUNC(pmu_event, AXI1_WR_TRAN_REQ_STALLED) SEP \ + FUNC(pmu_event, AXI1_WR_DATA_BEAT_STALLED) SEP \ + FUNC(pmu_event, AXI1_ENABLED_CYCLES) SEP \ + FUNC(pmu_event, AXI1_RD_STALL_LIMIT) SEP \ + FUNC(pmu_event, AXI1_WR_STALL_LIMIT) SEP \ + FUNC(pmu_event, ECC_SB1) + +#define EXPAND_POOLING_MODE(FUNC, SEP) \ + FUNC(pooling_mode, MAX) SEP \ + FUNC(pooling_mode, AVERAGE) SEP \ + FUNC(pooling_mode, REDUCE_SUM) + +#define EXPAND_PRIVILEGE_LEVEL(FUNC, SEP) \ + FUNC(privilege_level, USER) SEP \ + FUNC(privilege_level, PRIVILEGED) + +#define EXPAND_ROUND_MODE(FUNC, SEP) \ + FUNC(round_mode, DBL) SEP \ + FUNC(round_mode, TRUNCATE) SEP \ + FUNC(round_mode, 
NATURAL) + +#define EXPAND_SECURITY_LEVEL(FUNC, SEP) \ + FUNC(security_level, SECURE) SEP \ + FUNC(security_level, NON_SECURE) + +#define EXPAND_STATE(FUNC, SEP) \ + FUNC(state, STOPPED) SEP \ + FUNC(state, RUNNING) + +#define EXPAND_WD_CORE_SLICE_STATE(FUNC, SEP) \ + FUNC(wd_core_slice_state, HEADER) SEP \ + FUNC(wd_core_slice_state, PALETTE) SEP \ + FUNC(wd_core_slice_state, WEIGHTS) + +#define EXPAND_WD_CTRL_STATE(FUNC, SEP) \ + FUNC(wd_ctrl_state, IDLE) SEP \ + FUNC(wd_ctrl_state, DRAIN) SEP \ + FUNC(wd_ctrl_state, OFD_INIT) SEP \ + FUNC(wd_ctrl_state, OFD_RUN) + +#define EXPAND_WEIGHT_ORDER(FUNC, SEP) \ + FUNC(weight_order, DEPTH_FIRST) SEP \ + FUNC(weight_order, PART_KERNEL_FIRST) + +#ifdef __cplusplus +} +#endif diff --git a/ethosu/regor/architecture/ethosu65/ethos_u65_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu65/ethos_u65_register_cs_generator.cpp new file mode 100644 index 00000000..75bc9b25 --- /dev/null +++ b/ethosu/regor/architecture/ethosu65/ethos_u65_register_cs_generator.cpp @@ -0,0 +1,40 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "ethos_u65_register_cs_generator.hpp" + +#include "ethos_u65.hpp" +#define NPU_NAMESPACE ethosu65 +#include "ethos_u65_interface.hpp" + +namespace regor +{ + +using namespace ethosu65; + +EthosU65RCSGenerator::EthosU65RCSGenerator(ArchEthosU65 *arch) : EthosU55RCSGenerator(arch), _arch(arch) +{ +} + +void EthosU65RCSGenerator::GenerateInitialRegisterSetup() +{ + auto mode = _arch->_cores <= 1 ? parallel_mode::SINGLE_CORE : parallel_mode::DUAL_CORE_DEPTH; + Emit(isa::npu_set_parallel_mode_t(mode)); +} + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu65/ethos_u65_register_cs_generator.hpp b/ethosu/regor/architecture/ethosu65/ethos_u65_register_cs_generator.hpp new file mode 100644 index 00000000..f3448e17 --- /dev/null +++ b/ethosu/regor/architecture/ethosu65/ethos_u65_register_cs_generator.hpp @@ -0,0 +1,41 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "architecture/ethosu55/ethos_u55_register_cs_generator.hpp" +#include "ethos_u65.hpp" + +namespace regor +{ +/// +/// Generates register command streams for Ethos U55 and Ethos U65. 
+/// +class EthosU65RCSGenerator : public EthosU55RCSGenerator +{ +public: + EthosU65RCSGenerator(ArchEthosU65 *arch); + +protected: + void GenerateInitialRegisterSetup() override; + +private: + ArchEthosU65 *_arch; +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp new file mode 100644 index 00000000..e9b04ad6 --- /dev/null +++ b/ethosu/regor/architecture/ethosu85/ethos_u85.cpp @@ -0,0 +1,1324 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "ethos_u85.hpp" + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "common/bit_flags.hpp" +#include "common/numeric_util.hpp" +#include "ethos_u85_performance.hpp" +#include "ethos_u85_register_cs_generator.hpp" +#include "ethos_u85_weight_encoder.hpp" + +#include +#include +#include +#include + +BEGIN_ENUM_TABLE(regor::EthosU85Accumulator) + ADD_ENUM_NAME(Acc32) + ADD_ENUM_NAME(Acc48) +END_ENUM_TABLE() + +BEGIN_ENUM_TABLE(regor::EthosU85Traversal) + ADD_ENUM_NAME(DepthFirst) + ADD_ENUM_NAME(PartKernel) + ADD_ENUM_NAME(Depthwise) +END_ENUM_TABLE() + +namespace regor +{ + +unsigned MaskForNpuOp(const EthosU85NpuOp npuOp, bool hasIfm2); +bool IsMinMaxReduction(OpType opType, const Kernel *kernel); + +static const EthosU85PerfInfo s_EthosU85PerfInfo[] = { + // Accelerator.Ethos_U85_128 + {{2.0, 3.0, 3.0, 3.0, 4.0, 6.0, 1.0, 2.0}, {1.0, 1.0, 0.0}}, + // Accelerator.Ethos_U85_256 + {{2.0, 3.0, 3.0, 3.0, 4.0, 6.0, 1.0, 2.0}, {1.0, 1.0, 0.0}}, + // Accelerator.Ethos_U85_512 + {{1.0, 1.5, 1.5, 1.5, 2.0, 3.0, 0.5, 1.0}, {1.0, 1.0, 0.0}}, + // Accelerator.Ethos_U85_1024 + {{0.75, 1.25, 0.75, 0.75, 1.0, 1.5, 0.25, 0.5}, {1.0, 0.5, 0.0}}, + // Accelerator.Ethos_U85_2048 + {{0.625, 1.125, 0.5, 0.375, 0.5, 0.75, 0.125, 0.25}, {1.0, 0.25, 0.0}}, +}; + +static const ArchEthosU85::AcceleratorConfig s_EthosU85Configs[] = { + // Accelerator.Ethos_U85_128 + {128, 1, {Shape(1, 2, 8), Shape(1, 1, 16)}, 2, Shape(1, 2, 8), 8 * 1024, 8 * 1024, 2, 1, 0, &s_EthosU85PerfInfo[0]}, + // Accelerator.Ethos_U85_256 + {256, 1, {Shape(1, 2, 16), Shape(1, 4, 8), Shape(2, 2, 8)}, 3, Shape(2, 2, 8), 16 * 1024, 16 * 1024, 4, 1, 0, &s_EthosU85PerfInfo[0]}, + // Accelerator.Ethos_U85_512 + {512, 2, {Shape(2, 2, 16), Shape(1, 4, 16)}, 2, Shape(2, 2, 16), 16 * 1024, 32 * 1024, 8, 1, 0, &s_EthosU85PerfInfo[1]}, + // Accelerator.Ethos_U85_1014 + {1024, 4, {Shape(2, 2, 32), Shape(1, 4, 32), Shape(2, 4, 16)}, 3, Shape(4, 2, 16), 16 * 1024, 64 * 1024, 16, 1, 1, 
&s_EthosU85PerfInfo[2]}, + // Accelerator.Ethos_U85_2048 + {2048, 4, {Shape(2, 2, 64), Shape(1, 4, 64), Shape(4, 4, 16)}, 3, Shape(4, 4, 16), 32 * 1024, 128 * 1024, 32, 2, 1, &s_EthosU85PerfInfo[3]}, +}; + +enum class ElementwiseUsage +{ + No = 0, + Full = 1, + Scalar = 2, +}; + +static int AccumulatorBits(EthosU85Accumulator accType) +{ + int bits = 32; + switch ( accType ) + { + case EthosU85Accumulator::Acc32: + bits = 32; + break; + case EthosU85Accumulator::Acc48: + bits = 64; + break; + default: + LOG_WARN("Invalid accumulator type for Ethos U85: {}\n", accType); + assert(false); + break; + } + return bits; +} + + +ArchEthosU85::ArchEthosU85() : _subkernelMax(8, 8, 65536), _ofmBlockMax(128, 128, 1024) +{ + _weightEncoder = std::make_unique(this); + _rcsGenerator = std::make_unique(this); +} + +uint32_t ArchEthosU85::Version() +{ + return EthosU85RCSGenerator::IdRegister(); +} + +bool ArchEthosU85::ParseConfig(IniReader *reader) +{ + // Parse architecture configuration + std::string key; + int macs = 0; + while ( reader->Begin(key) ) + { + if ( key == "macs" ) + { + macs = reader->Get(); + } + reader->End(); + } + + // Find the requested MAC configuration for this accelerator + auto cfg = std::find_if(s_EthosU85Configs, std::cend(s_EthosU85Configs), + [&](const AcceleratorConfig &config) { return config.macs == macs; }); + if ( cfg == std::cend(s_EthosU85Configs) ) + { + assert(macs == 128 || macs == 256 || macs == 512 || macs == 1024 || macs == 2048); + LOG_TRACE0("Unable to find Ethos U85 accelerator for macs={}", macs); + return false; + } + + ApplyConfig(cfg); + + return true; +} + +void ArchEthosU85::ApplyConfig(const AcceleratorConfig *cfg) +{ + // Basic configuration + _cores = cfg->cores; + _macs = cfg->macs; + _ifmUBlock = cfg->ifmUBlock; + _nOfmUBlocks = cfg->nOfmUBlocks; + std::copy(std::begin(cfg->ofmUBlocks), std::end(cfg->ofmUBlocks), std::begin(_ofmUBlocks)); + + // Internal memory + _ifmRamSizeBytes = cfg->ifmRamSizeBytes; + _accRamSizeBytes = 
cfg->accRamSizeBytes; + _numAxiSramLog2 = cfg->numAxiSramLog2; + _numAxiExtLog2 = cfg->numAxiExtLog2; + + _lutRam = std::make_unique("lutram", 2048); + // TODO MLBEDSW-7980 fix LUT performance parameters + _lutRam->SetParameters(1, 0, 0, 1, 0); + _lutMemory = _lutRam.get(); + _performance = std::unique_ptr(new EthosU85Performance(this, cfg->perfInfo)); + + // Populate ofmUBlock -> NpuOp lookup table + SetupOfmUBlockToOpTable(); + // Populate ofmUBlock -> ifmAlloc unit table + SetupOfmUBlockToIfmAuTable(); +} + + +std::unique_ptr ArchEthosU85::GetOpConfig(OpType opType, const ArchitectureConfigQuery &query) +{ + auto config = FindBlockConfig(opType, query); + return config; +} + + +std::unique_ptr ArchEthosU85::CreateOpGroup(const ArchitectureOpGroupQuery &op) +{ + LOG_TRACE1("Trying to create ArchEthosU85 OpGroup for {}\n", OpTypeToString(op.type)); + + auto group = std::make_unique(); + if ( !group->Add(op) ) + { + return nullptr; + } + + return group; +} + +std::vector ArchEthosU85::ConfigRegisters() +{ + return std::vector(1, ConfigRegister(2)); +} + +int ArchEthosU85::UpscaleAndRounding(ArchResampling resampling, int &rounding) +{ + rounding = (resampling == ArchResampling::Nearest) ? 1 : 0; + return (resampling == ArchResampling::Zeros) ? 
2 : 1; +} + +AxisMask ArchEthosU85::CanSubdivide(OpType opType) +{ + if ( IsConvolution(opType) || IsElementwise(opType) || IsPooling(opType) ) + { + return AxisMask::AxisY; + } + return AxisMask::None; +} + +bool ArchEthosU85::SupportsLeakyRelu(bool /*quantized*/, DataType /*type*/) +{ + return true; +} + +bool ArchEthosU85::SupportsMatMul(OpType opType) +{ + EthosU85NpuOp npuOp = GetHWOp(opType); + if ( npuOp == EthosU85NpuOp::None ) + { + return false; + } + + return true; +} + +bool ArchEthosU85::SupportsTranspose(OpType opType, TransposeType transposeType) +{ + if ( IsNone(transposeType) ) return true; + + EthosU85NpuOp npuOp = GetHWOp(opType); + if ( npuOp == EthosU85NpuOp::None || npuOp == EthosU85NpuOp::Resize || npuOp == EthosU85NpuOp::Dma ) + { + return false; + } + else if ( npuOp == EthosU85NpuOp::Elementwise ) + { + return transposeType == TransposeType::NHWC || transposeType == TransposeType::NHCW || transposeType == TransposeType::NCHW; + } + + return transposeType == TransposeType::NHWC || transposeType == TransposeType::NWHC || transposeType == TransposeType::NHCW || + transposeType == TransposeType::NWCH || transposeType == TransposeType::NCHW || transposeType == TransposeType::NCWH; +} + +bool ArchEthosU85::SupportsReverse(OpType opType, ReverseType reverseType) +{ + if ( reverseType == ReverseType::None ) return true; + + EthosU85NpuOp npuOp = GetHWOp(opType); + if ( npuOp == EthosU85NpuOp::None || npuOp == EthosU85NpuOp::Elementwise || npuOp == EthosU85NpuOp::Dma ) + { + return false; + } + + return reverseType == ReverseType::H || reverseType == ReverseType::W || reverseType == ReverseType::C; +} + +bool ArchEthosU85::SupportsGather(OpType opType) +{ + EthosU85NpuOp npuOp = GetHWOp(opType); + if ( npuOp == EthosU85NpuOp::None ) + { + return false; + } + + return true; +} + +bool ArchEthosU85::SupportsScatter(OpType opType) +{ + EthosU85NpuOp npuOp = GetHWOp(opType); + if ( npuOp == EthosU85NpuOp::None ) + { + return false; + } + + return true; 
+} + +bool ArchEthosU85::SupportsSigmoidTanhLutInt16(OpType opType) +{ + return (opType == OpType::Sigmoid || opType == OpType::Tanh); +} + +bool ArchEthosU85::SupportsArgMax(OpType opType) +{ + EthosU85NpuOp npuOp = GetHWOp(opType); + if ( npuOp == EthosU85NpuOp::None ) + { + return false; + } + + return true; +} + +bool ArchEthosU85::SupportsResize(const ResizeSupportQuery &query) +{ + /* Supported operator checks for resize operations + * + * * Scaling numerators must be less than or equal to 2048 + * * Offsets must be in the range [-numerator, numerator) for each axis + * * The following constraints apply to upscale-factors + * mode REPLICATE: + * Any width and height upscale-factors are supported + * mode NEAREST: + * Any width and height upscale-factors are supported + * mode BILINEAR: + * if IFM W*H == 1*1: + * Any width and height upscale-factors are supported + * else: + * The upscale-factors need to be powers-of-two. + */ + if ( query.ifmShape.Width() == 1 && query.ifmShape.Height() == 1 ) + { + return true; + } + + int n_w = query.scaleX.n; + int d_w = query.scaleX.d; + int n_h = query.scaleY.n; + int d_h = query.scaleY.d; + bool supported = true; + + if ( n_h > 2048 ) + { + LOG_WARN("Resize height scale numerator ({}) exceeds maximum size (2048).\n", n_h); + supported = false; + } + if ( n_w > 2048 ) + { + LOG_WARN("Resize width scale numerator ({}) exceeds maximum size (2048).\n", n_w); + supported = false; + } + if ( query.offsetY >= n_h || query.offsetY < -n_h ) + { + LOG_WARN("Resize height offset: {} is outside the valid range [-height_numerator, height_numerator) = [{}, {})\n", + query.offsetY, -n_h, n_h); + supported = false; + } + if ( query.offsetX >= n_w || query.offsetX < -n_w ) + { + LOG_WARN("Resize width offset: {} is outside the valid range [-with_numerator, width_numerator) = [{}, {})\n", + query.offsetX, -n_w, n_w); + supported = false; + } + + if ( query.mode == ArchResizeMode::Bilinear ) + { + // Get scale fractions and verify that 
scale-factor is a power of two. + + if ( n_w % d_w != 0 ) + { + LOG_WARN("ResizeBilinear width scale-factor is not an integer: {}/{}\n", n_w, d_w); + supported = false; + } + if ( n_h % d_h != 0 ) + { + LOG_WARN("ResizeBilinear height scale-factor is not an integer: {}/{}\n", n_h, d_h); + supported = false; + } + int scale_w = n_w / d_w; + int scale_h = n_h / d_h; + if ( !IsPowerOfTwo(scale_w) ) + { + LOG_WARN("ResizeBilinear width scale-factor is not a power of two: {}\n", double(n_w) / d_w); + supported = false; + } + if ( !IsPowerOfTwo(scale_h) ) + { + LOG_WARN("ResizeBilinear height scale-factor is not a power of two: {}\n", double(n_h) / d_h); + supported = false; + } + return supported; + } + return supported; +} + +bool ArchEthosU85::SupportsAccumulatorMode(ArchAccumulatorSource source, bool outputEnabled) +{ + UNUSED(outputEnabled); + return source == ArchAccumulatorSource::Reset || source == ArchAccumulatorSource::Acc || source == ArchAccumulatorSource::Ifm2; +} + +bool ArchEthosU85::SupportsScalar(OpType opType, DataType dataType, TensorUsage usage) +{ + bool supportedType(dataType == DataType::Int8 || dataType == DataType::UInt8 || dataType == DataType::Int16 || dataType == DataType::Int32); + return EthosU85RCSGenerator::IsSupportedElementwise(opType) && supportedType && IsIFM(usage); +} + +Flags ArchEthosU85::SupportedWeightFormat(OpType op) +{ + auto hwOp = GetHWOp(op); + if ( hwOp == EthosU85NpuOp::Convolution || hwOp == EthosU85NpuOp::VectorProduct ) + { + return Flags(WeightFormat::Default, WeightFormat::Fast, WeightFormat::Sparse2_4); + } + return Flags(WeightFormat::Default); +} + +bool ArchEthosU85::UseAvgPoolNop(OpType type) +{ + return IsActivation(type) || type == OpType::Quantize || type == OpType::MemoryCopy; +} + +static bool ChooseKernelMethod(const Shape &ifmShape, int ifmBits, const Kernel *kernel) +{ + if ( ifmShape.Depth() <= 8 ) + { + return true; + } + + // Compare part-kernel to depth-kernel and choose the one with best utilisation 
+ int kernelElements = kernel->ElementsWH(); + double depthUtilisation = ifmShape.Depth() / double(RoundAway(ifmShape.Depth(), ifmBits == 8 ? 32 : 16)); + double partUtilisation = + (ifmShape.Depth() / double(RoundAway(ifmShape.Depth(), 8)) * + (kernelElements / double(RoundAway(kernelElements, ifmBits == 8 ? 4 : 2)))); + + return partUtilisation >= depthUtilisation; +} + + +static Shape GetArchIFMBlockSize(const Shape &ofmBlock, const Kernel *kernel, const Shape &ublock, + const Shape &subkernelLimit, int upscale, int rounding) +{ + Point2i dilatedSize = kernel->DilatedWH(); + + // IFM block height + int h = RequiredInputSize(ofmBlock.Height(), kernel->Stride().y, std::min(dilatedSize.y, subkernelLimit.Height()), upscale, rounding); + h = RoundAway(h, ublock.Height()); + + // IFM block width + int w = RequiredInputSize(ofmBlock.Width(), kernel->Stride().x, std::min(dilatedSize.x, subkernelLimit.Width()), upscale, rounding); + w = RoundAway(w, ublock.Width()); + + return Shape(1, h, w, ofmBlock.Depth()); +} + +unsigned MaskForNpuOp(const EthosU85NpuOp npuOp, bool hasIfm2 = false) +{ + if ( npuOp == EthosU85NpuOp::VectorProduct && hasIfm2 ) + { + // first bit is reserved for matmul + return 1; + } + return 1 << (int(npuOp)); +} + +int ArchEthosU85::IndexForOfmUBlock(const Shape &ofmUBlock) +{ + auto it = std::find(_ofmUBlocks.begin(), _ofmUBlocks.end(), ofmUBlock); + if ( it == _ofmUBlocks.end() ) + { + LOG_WARN("OFM microblock {} is not supported for this configuration\n", ofmUBlock.ToString()); + assert(false); + } + return int(std::distance(_ofmUBlocks.begin(), it)); +} + +void ArchEthosU85::SetupOfmUBlockToIfmAuTable() +{ + if ( _macs == 128 ) + { + int b_1x2x8 = IndexForOfmUBlock(Shape(1, 2, 8)); + int b_1x1x16 = IndexForOfmUBlock(Shape(1, 1, 16)); + _uBlockToIfmAuTable[b_1x2x8] = {Shape(1, 2, 1), Shape(1, 1, 2), Shape(1, 1, 2)}; + _uBlockToIfmAuTable[b_1x1x16] = _uBlockToIfmAuTable[b_1x2x8]; + } + else if ( _macs == 256 ) + { + int b_2x2x8 = 
IndexForOfmUBlock(Shape(2, 2, 8)); + int b_1x4x8 = IndexForOfmUBlock(Shape(1, 4, 8)); + int b_1x2x16 = IndexForOfmUBlock(Shape(1, 2, 16)); + _uBlockToIfmAuTable[b_2x2x8] = {Shape(2, 2, 1), Shape(1, 2, 2), Shape(1, 1, 4)}; + _uBlockToIfmAuTable[b_1x2x16] = _uBlockToIfmAuTable[b_2x2x8]; + _uBlockToIfmAuTable[b_1x4x8] = {Shape(1, 4, 1), Shape(1, 2, 2), Shape(1, 1, 4)}; + } + else if ( _macs == 512 ) + { + int b_2x2x16 = IndexForOfmUBlock(Shape(2, 2, 16)); + int b_1x4x16 = IndexForOfmUBlock(Shape(1, 4, 16)); + _uBlockToIfmAuTable[b_2x2x16] = {Shape(2, 2, 1), Shape(1, 2, 2), Shape(1, 1, 4)}; + _uBlockToIfmAuTable[b_1x4x16] = {Shape(1, 4, 1), Shape(1, 2, 2), Shape(1, 1, 4)}; + } + else if ( _macs == 1024 ) + { + int b_2x2x32 = IndexForOfmUBlock(Shape(2, 2, 32)); + int b_1x4x32 = IndexForOfmUBlock(Shape(1, 4, 32)); + int b_2x4x16 = IndexForOfmUBlock(Shape(2, 4, 16)); + _uBlockToIfmAuTable[b_2x2x32] = {Shape(2, 4, 1), Shape(2, 2, 2), Shape(1, 2, 4)}; + _uBlockToIfmAuTable[b_2x4x16] = _uBlockToIfmAuTable[b_2x2x32]; + _uBlockToIfmAuTable[b_1x4x32] = {Shape(2, 4, 1), Shape(1, 4, 2), Shape(1, 2, 4)}; + } + else + { + int b_2x2x64 = IndexForOfmUBlock(Shape(2, 2, 64)); + int b_1x4x64 = IndexForOfmUBlock(Shape(1, 4, 64)); + int b_4x4x16 = IndexForOfmUBlock(Shape(4, 4, 16)); + _uBlockToIfmAuTable[b_2x2x64] = {Shape(4, 4, 1), Shape(2, 4, 2), Shape(2, 2, 4)}; + _uBlockToIfmAuTable[b_4x4x16] = _uBlockToIfmAuTable[b_2x2x64]; + _uBlockToIfmAuTable[b_1x4x64] = {Shape(4, 4, 1), Shape(2, 4, 2), Shape(1, 4, 4)}; + } +} + +void ArchEthosU85::SetupOfmUBlockToOpTable() +{ + unsigned conv = MaskForNpuOp(EthosU85NpuOp::Convolution); + unsigned depthwise = MaskForNpuOp(EthosU85NpuOp::Depthwise); + unsigned vectorprod = MaskForNpuOp(EthosU85NpuOp::VectorProduct); + unsigned pool = MaskForNpuOp(EthosU85NpuOp::Pooling); + unsigned reducesum = MaskForNpuOp(EthosU85NpuOp::ReduceSum); + unsigned elementwise = MaskForNpuOp(EthosU85NpuOp::Elementwise); + unsigned resize = 
MaskForNpuOp(EthosU85NpuOp::Resize); + unsigned matmul = MaskForNpuOp(EthosU85NpuOp::VectorProduct, true); + unsigned dma = MaskForNpuOp(EthosU85NpuOp::Dma, true); + + // clang-format off + if ( _macs == 128 ) + { + unsigned b_1x2x8 = IndexForOfmUBlock(Shape(1, 2, 8)); + unsigned b_1x1x16 = IndexForOfmUBlock(Shape(1, 1, 16)); + _uBlockToOpTable[b_1x2x8] = { + // 8 bit ifm + conv | matmul | vectorprod | reducesum | elementwise | resize, + // 16 bit ifm + conv | matmul | vectorprod | depthwise | pool | reducesum | elementwise | resize, + // 32 bit ifm + reducesum | elementwise | resize, + }; + _uBlockToOpTable[b_1x1x16] = { + depthwise | pool | elementwise | resize, + vectorprod | elementwise | resize, + elementwise | resize + }; + } + else if ( _macs == 256 ) + { + unsigned b_2x2x8 = IndexForOfmUBlock(Shape(2, 2, 8)); + unsigned b_1x4x8 = IndexForOfmUBlock(Shape(1, 4, 8)); + unsigned b_1x2x16 = IndexForOfmUBlock(Shape(1, 2, 16)); + _uBlockToOpTable[b_2x2x8] = { + conv | matmul | vectorprod | reducesum | elementwise | resize, + conv | matmul | vectorprod | depthwise | pool | reducesum | elementwise | resize, + reducesum | elementwise | resize + }; + _uBlockToOpTable[b_1x4x8] = { + conv | matmul | vectorprod | reducesum | elementwise | resize, + conv | matmul | vectorprod | depthwise | pool | reducesum | elementwise | resize, + reducesum | elementwise | resize + }; + _uBlockToOpTable[b_1x2x16] = { + depthwise | pool | elementwise | resize, + vectorprod | elementwise | resize, + elementwise | resize + }; + } + else if ( _macs == 512 ) + { + unsigned b_2x2x16 = IndexForOfmUBlock(Shape(2, 2, 16)); + unsigned b_1x4x16 = IndexForOfmUBlock(Shape(1, 4, 16)); + _uBlockToOpTable[b_2x2x16] = { + conv | depthwise | vectorprod | pool | reducesum | elementwise | resize | matmul, + conv | depthwise | vectorprod | pool | reducesum | elementwise | resize | matmul, + reducesum | elementwise | resize, + }; + _uBlockToOpTable[b_1x4x16] = { + conv | depthwise | vectorprod | pool | 
reducesum | elementwise | resize | matmul, + conv | depthwise | vectorprod | pool | reducesum | elementwise | resize | matmul, + reducesum | elementwise | resize, + }; + } + else if ( _macs == 1024 ) + { + unsigned b_2x2x32 = IndexForOfmUBlock(Shape(2, 2, 32)); + unsigned b_1x4x32 = IndexForOfmUBlock(Shape(1, 4, 32)); + unsigned b_2x4x16 = IndexForOfmUBlock(Shape(2, 4, 16)); + _uBlockToOpTable[b_2x2x32] = { + conv | matmul | vectorprod | elementwise, + conv | matmul | vectorprod | elementwise, + elementwise, + }; + _uBlockToOpTable[b_1x4x32] = { + conv | matmul | vectorprod | elementwise, + conv | matmul | vectorprod | elementwise, + elementwise, + }; + _uBlockToOpTable[b_2x4x16] = { + conv | vectorprod | depthwise | pool | reducesum | elementwise | resize, + conv | vectorprod | depthwise | pool | reducesum | elementwise | resize, + reducesum | elementwise | resize, + }; + } + else + { // 2048 + unsigned b_2x2x64 = IndexForOfmUBlock(Shape(2, 2, 64)); + unsigned b_1x4x64 = IndexForOfmUBlock(Shape(1, 4, 64)); + unsigned b_4x4x16 = IndexForOfmUBlock(Shape(4, 4, 16)); + _uBlockToOpTable[b_2x2x64] = { + conv | matmul | vectorprod | elementwise, + conv | matmul | vectorprod | elementwise, + elementwise, + }; + _uBlockToOpTable[b_1x4x64] = { + conv | matmul | vectorprod | elementwise, + conv | matmul | vectorprod | elementwise, + elementwise, + }; + _uBlockToOpTable[b_4x4x16] = { + conv | vectorprod | depthwise | pool | reducesum | elementwise | resize, + conv | vectorprod | depthwise | pool | reducesum | elementwise | resize, + reducesum | elementwise | resize, + }; + } + // clang-format on +} + +bool ArchEthosU85::IsUBlockValid(const OpType opType, int ifmBits, const Shape &ofmUBlock, bool hasIfm2) +{ + EthosU85NpuOp npuOp = GetHWOp(opType); + if ( npuOp == EthosU85NpuOp::None ) + { + return false; + } + + unsigned blockIdx = IndexForOfmUBlock(ofmUBlock); + if ( blockIdx >= _uBlockToOpTable.size() ) + { + LOG_WARN("OFM microblock {} is not a valid block for Ethos 
U85-{}\n", ofmUBlock.ToString(), _macs); + return false; + } + + auto &bitsToOperations = _uBlockToOpTable[blockIdx]; + + unsigned bitIdx = (ifmBits / 16); + if ( bitIdx >= bitsToOperations.size() ) + { + LOG_DEBUG("(OFM microblock validation - ifmbits: {} is not a valid ifm precision\n", ifmBits); + return false; + } + + // one-hot encoded mask for NpuOp operations + unsigned opmask = MaskForNpuOp(npuOp, hasIfm2); + return bitsToOperations[bitIdx] & opmask; +} + +bool IsMinMaxReduction(OpType opType, const Kernel *kernel) +{ + // MIN/MAX Reduction over width or height is defined as a MAX/MIN-pool with 1-D kernel. + return (opType == OpType::MaxPool || opType == OpType::Min) && (kernel->Size().x == 1 || kernel->Size().y == 1); +} + +Shape ArchEthosU85::FindUBlock(OpType opType, const ArchitectureConfigQuery &query) +{ + int lookupBits = query.ifmBits; + if ( IsMinMaxReduction(opType, query.kernel) && lookupBits == 32 ) + { + // 16-bit microblock lookup-table is used for + // 32-bit Min/Max reductions. 
+ lookupBits = 16; + } + + const EthosU85NpuOp npuOp = GetHWOp(opType); + assert(npuOp != EthosU85NpuOp::None); + + int bestWaste = std::numeric_limits::max(); + Shape bestUblk; + + for ( int i = 0; i < _nOfmUBlocks; i++ ) + { + const Shape &ublk = _ofmUBlocks[i]; + if ( !IsUBlockValid(opType, lookupBits, ublk, query.ifmShape[1] != Shape()) ) + { + continue; + } + + Shape tmp = Shape::RoundAway(query.ofmShape, ublk); + int waste = tmp.Elements() - query.ofmShape.Elements(); + if ( waste < bestWaste ) + { + bestUblk = ublk; + bestWaste = waste; + } + } + + return bestUblk; +} + +std::unique_ptr ArchEthosU85::FindBlockConfig(OpType opType, const ArchitectureConfigQuery &query) +{ + assert(query.ifmBits > 0 && query.ifmBits <= 32); + assert(query.ofmShape.Size() > 2 && "Insufficient dimensions to search for block config"); + assert(query.kernel != nullptr); + + if ( !SupportsAccumulatorMode(query.accSource, query.accOutputEnabled) ) return nullptr; + + const int OFMSplitDepth = 16; // Specific to this architecture + + // Elementwise larger-volume correction + const Shape &ifmShape = (query.ifmShape[1].Elements() > query.ifmShape[0].Elements()) ? 
query.ifmShape[1] : query.ifmShape[0]; + + EthosU85NpuOp npuOp = GetHWOp(opType); + assert(npuOp != EthosU85NpuOp::None); + + // Operator typing help + bool isPooling = npuOp == EthosU85NpuOp::Pooling || npuOp == EthosU85NpuOp::ReduceSum; + bool isReduceSum = npuOp == EthosU85NpuOp::ReduceSum; + bool isDepthwise = npuOp == EthosU85NpuOp::Depthwise; + bool isElementwise = npuOp == EthosU85NpuOp::Elementwise; + bool isConvolution = npuOp == EthosU85NpuOp::Convolution || npuOp == EthosU85NpuOp::Depthwise; + bool isResize = npuOp == EthosU85NpuOp::Resize; + bool isDma = npuOp == EthosU85NpuOp::Dma; + bool isPartKernel = isConvolution && ChooseKernelMethod(ifmShape, query.ifmBits, query.kernel); + bool isEqualDepthOp = isElementwise || (isPooling && !isReduceSum) || isDepthwise || isResize; + + if ( isDma ) + { + // DMA ops doesn't use block config + return nullptr; + } + + // Operator configuration to be returned + auto config = std::make_unique(); + + EthosU85Traversal traversal = isDepthwise ? EthosU85Traversal::Depthwise : (isPartKernel ? 
EthosU85Traversal::PartKernel : EthosU85Traversal::DepthFirst); + + // Accumulator settings + EthosU85Accumulator accType = EthosU85Accumulator::Acc32; + if ( query.ifmBits == 16 && (!isPooling || isReduceSum) && query.scaled ) + { + accType = EthosU85Accumulator::Acc48; + } + else if ( query.ifmBits == 64 && isPooling ) + { + // Special case for Rescale int48 + accType = EthosU85Accumulator::Acc48; + } + + int accBits = AccumulatorBits(accType); + int rounding; + int upscale = UpscaleAndRounding(query.ifmResampling, rounding); + int numBlocksInRam = 2; + + const Shape ofmUBlock = FindUBlock(opType, query); + if ( ofmUBlock == Shape() ) + { + // no valid ofm microblock found + LOG_WARN("Could not find a valid OFM microblock for {} with {}-bit input.\n", OpTypeToString(opType), query.ifmBits); + return nullptr; + } + + // Subkernel repeats of the IFM + Point2i dilatedWH = query.kernel->DilatedWH(); + int ifmRepeats = DivRoundUp(dilatedWH.x, _subkernelMax.Width()) * DivRoundUp(dilatedWH.y, _subkernelMax.Height()); + + int ifmBlockDepth = 0; + const bool sparse = query.weightFormat & WeightFormat::Sparse2_4; + if ( isPartKernel ) + { + ifmBlockDepth = 16; + } + else if ( query.ifmBits == 32 || ((_macs == 128 || _macs == 256) && ofmUBlock.Depth() == 16 && !sparse) ) + { + ifmBlockDepth = 32; + } + else if ( sparse && traversal == EthosU85Traversal::DepthFirst && query.ifmBits == 8 ) + { + ifmBlockDepth = 128; + } + else + { + ifmBlockDepth = 64; + } + + // Weights fetch (for operators that have them) + int weightFetchWH = isConvolution ? query.kernel->Size().AreaXY() : 0; + + int ofmUBlockDepth = ofmUBlock.Depth(); + + // When using brick format and certain transposes, there are additional constraints to the block size, so we must + // extend the search space to be able to find a valid block size. 
+ Shape ofmBlockMin = Shape(0, 0, 0); + if ( query.ofmFormat == TensorFormat::NHCWB16 ) + { + switch ( query.transpose ) + { + case TransposeType::NCHW: + case TransposeType::NHCW: + ofmBlockMin = ofmBlockMin.WithWidth(16); + break; + case TransposeType::NCWH: + case TransposeType::NWCH: + ofmBlockMin = ofmBlockMin.WithHeight(16); + break; + default: + break; + } + } + Shape searchSpaceStep = Shape::Max(ofmUBlock, ofmBlockMin); + Shape ofmBlockMaxTp = _ofmBlockMax.Untranspose(Reduce4To3(query.transpose)); + Shape searchSpaceEnd = Shape::RoundAway(Shape::Max(Shape::Min(query.ofmShape, ofmBlockMaxTp), searchSpaceStep), ofmUBlock); + + if ( isResize ) + { + // resize operations are constrained to OFM block height 1 and depth 1-16 + // TODO MLBEDSW-8573: Improve block config search for Resize/Elementwise operations + int resizeMaxWidth = CalcResizeMaxOfmBlockWidth(query.ifmBits, query.rescaling.scaleX.n, query.rescaling.scaleX.d); + // reduce minimal step if max width becomes smaller than the minimal step + if ( resizeMaxWidth < searchSpaceStep.Width() ) + { + searchSpaceStep = searchSpaceStep.WithWidth(resizeMaxWidth); + } + searchSpaceStep = searchSpaceStep.WithHeight(1); + searchSpaceEnd = searchSpaceEnd.WithHeight(1).WithDepth(16).WithWidth(resizeMaxWidth); + } + + // At this point, OFM is already configured to NHWC but we need to limit OFM block depth as well. 
+ if ( query.reverse == ReverseType::C ) + { + searchSpaceEnd = Shape::Min(searchSpaceEnd, searchSpaceEnd.WithDepth(16)); + } + + // Block WHC search, loops across the search space looking for best efficiency + float bestCost = std::numeric_limits::infinity(); + float bestCoverage = std::numeric_limits::infinity(); + int ofmElements = query.ofmShape.Elements(); + + int depth = std::max(ofmUBlockDepth, std::min(searchSpaceEnd.Depth(), OFMSplitDepth)); + int restartDepth = depth; + if ( depth < query.ofmShape.Depth() ) + { + depth = RoundAway(depth, OFMSplitDepth); + } + + Shape ifmAllocUnit = CalcIfmAUSize(ifmBlockDepth, query.ifmBits, ofmUBlock); + + std::unordered_set> wontFit; + while ( depth <= searchSpaceEnd.Depth() ) + { + if ( isEqualDepthOp ) + { + // For equal depth ops, IFMBlockDepth == OFMBlockDepth + // Recalculate the IFM AU for the new depth + ifmBlockDepth = depth; + ifmAllocUnit = CalcIfmAUSize(depth, query.ifmBits, ofmUBlock); + } + + for ( int height = searchSpaceStep.Height(); height <= searchSpaceEnd.Height(); height += searchSpaceStep.Height() ) + { + for ( int width = searchSpaceStep.Width(); width <= searchSpaceEnd.Width(); width += searchSpaceStep.Width() ) + { + // Avoid checking W/H transposed blocks that already didn't fit. i.e. if 8x4x16 didn't + // fit, then 4x8x16 won't either. 
+ if ( wontFit.count(Point2i(height, width)) > 0 ) + { + continue; + } + + // Calculate the IFM block dimensions required to feed this OFM block + Shape ofmBlock = Shape(height, width, depth); + + Shape ifmBlock = GetArchIFMBlockSize(ofmBlock, query.kernel, ifmAllocUnit, _subkernelMax, upscale, rounding); + ifmBlock = ifmBlock.WithDepth(ifmBlockDepth); + + // Test if the IFM/OFM blocks fit into RAM + if ( TryBlockConfig(npuOp, ofmBlock, ifmBlock, ifmShape, query.ifmBits, accBits, _ifmRamSizeBytes, + _accRamSizeBytes, ifmAllocUnit.Depth(), numBlocksInRam, isEqualDepthOp) ) + { + Shape fullBlocks = Shape::DivRoundUp(query.ofmShape, ofmBlock); + Point3 blocks = query.ofmShape.HWC() / ofmBlock.HWC(); + + // Weights fetching + float weightFetch = float(weightFetchWH) * ifmShape.Depth() * fullBlocks.ElementsWH(); + if ( !isDepthwise ) + { + weightFetch *= blocks.z * ofmBlock.Depth(); + } + + // IFM fetching + float ifmFetch = float(ifmBlock.ElementsWH()) * ifmShape.Depth() * ifmRepeats * blocks.x * blocks.y; + if ( !isEqualDepthOp ) + { + ifmFetch *= fullBlocks.Depth(); + } + + // Scale relative to every output OFM element + float relativeCost = + (isElementwise || isResize) ? 
float(ofmElements) / (float(height) * width * depth) : (ifmFetch + weightFetch) / float(ofmElements); + + // If the entire IFM can be encompassed by both buffers, bias to prefer this configuration + if ( ifmShape.Elements() < ifmBlock.Elements() * 2 ) + { + relativeCost = relativeCost / 2.0f; + } + + // Choose based on relative minimum cost or larger IFM area (if equal cost) + if ( relativeCost <= bestCost ) + { + bool chooseThis = false; + // Check IFM coverage only when it's equal best_cost and small OFM + if ( relativeCost == bestCost ) + { + Shape coverageShape = Shape::Min(ifmShape, ifmBlock); + float coverage = float(ifmShape.ElementsWH()) / float(coverageShape.ElementsWH()); + // Small 4x4 IFM constraint found through analysis of networks + if ( coverage <= bestCoverage && (height <= 4 && width <= 4) ) + { + bestCoverage = coverage; + chooseThis = true; + } + } + else + { + bestCoverage = std::numeric_limits::infinity(); + chooseThis = true; + } + + if ( chooseThis ) + { + bestCost = relativeCost; + config->_ifmBlock = std::move(ifmBlock); + config->_ofmBlock = Shape(1, height, width, depth); + } + } + } + else + { + wontFit.emplace(width, height); + } + } + } + + // Try Next block depth, rounded + depth = depth + ofmUBlockDepth; + if ( depth < query.ofmShape.Depth() ) + { + depth = RoundAway(depth, OFMSplitDepth); + } + if ( depth > searchSpaceEnd.Depth() && bestCost == std::numeric_limits::infinity() && numBlocksInRam == 2 ) + { + numBlocksInRam = 1; + depth = restartDepth; + } + } + + config->_ofmUBlock = std::move(ofmUBlock); + config->_accumulatorType = accType; + config->_accumulatorSource = query.accSource; + config->_accumulatorOutputEnabled = query.accOutputEnabled; + config->_ifmRamSizeBytes = _ifmRamSizeBytes; + config->_traversal = traversal; + + // Return the best configuration + if ( bestCost != std::numeric_limits::infinity() ) + { + return std::unique_ptr(config.release()); + } + + // Didn't find a configuration + return nullptr; +} + +Shape 
ArchEthosU85::CalcIfmAUSize(int ifmBlkDepth, int ifmBits, Shape ofmUBlk) +{ + int ifmu = 0; + int ifmDepthBits = ifmBlkDepth * ifmBits; + if ( ifmDepthBits > 256 ) + { + // ifmu3 + ifmu += 2; + } + else if ( ifmDepthBits > 128 ) + { + // ifmu2 + ifmu++; + } + assert(ifmu < 3); + unsigned blockIdx = IndexForOfmUBlock(ofmUBlk); + return _uBlockToIfmAuTable[blockIdx][ifmu]; +} + +int ArchEthosU85::CalcResizeMaxOfmBlockWidth(int ifmBits, int scaleN, int scaleD) +{ + // Calculate the maximum OfmBlockWidth that still allows + // the IFM block to fit in the chaining buffer + assert(scaleN > 0); + assert(scaleD > 0); + int numIfmCbSlots = _macs / 16; + if ( ifmBits == 16 ) + { + numIfmCbSlots /= 2; + } + int maxOfmBlkW = int(std::ceil(((numIfmCbSlots - 2) * scaleN + 1) / double(scaleD))); + maxOfmBlkW = std::max(1, std::min(maxOfmBlkW, _ofmBlockMax.Width())); + return maxOfmBlkW; +} + +bool ArchEthosU85::TryBlockConfig(EthosU85NpuOp npuOp, const Shape &ofmBlock, const Shape &ifmBlock, const Shape &ifmShape, + int ifmBits, int accBits, int ifmSpace, int accSpace, int ifmAuDepth, int numBlocksInRam, bool isEqualDepthOp) +{ + assert(accBits > 0); + assert((ifmBits >= 8) && ((ifmBits % 8) == 0)); + + // Elementwise and Resize don't use IB/AB. + if ( npuOp == EthosU85NpuOp::Elementwise || npuOp == EthosU85NpuOp::Resize ) + { + return true; + } + + // IFM Space + int ifmAlignDepth = ifmAuDepth * 128 / ifmBits; + int ifmBlockDepth = isEqualDepthOp ? 
ofmBlock.Depth() : std::min(ifmBlock.Depth(), ifmShape.Depth()); + ifmBlockDepth = RoundAway(ifmBlockDepth, ifmAlignDepth); + int ifmBytes = ifmBlock.ElementsWH() * ifmBlockDepth * (ifmBits / 8) * numBlocksInRam; + + // Accumulator space + int ofmBlockDepth = RoundAway(ofmBlock.Depth(), 16); + int accBytes = (ofmBlock.ElementsWH() * ofmBlockDepth * accBits) / 8 * numBlocksInRam; + + if ( ifmBytes > ifmSpace || accBytes > accSpace ) + { + return false; + } + + return true; +} + + +Shape ArchEthosU85::GetStorageRounding(TensorFormat format) +{ + if ( format == TensorFormat::NHCWB16 ) + { + return Shape(1, 1, 1, 16); + } + + return Shape(1, 1, 1, 1); +} + +uint32_t ArchEthosU85::ConfigRegister(int product) +{ + uint32_t macsLog2 = IntLog2(_macs); + uint32_t numWdLog2 = IntLog2(_cores); + + return EthosU85RCSGenerator::ConfigRegister(macsLog2, 1, _numAxiSramLog2, _numAxiExtLog2, numWdLog2, product); +} + + +std::unique_ptr EthosU85OpConfig::Clone() +{ + auto config = std::make_unique(); + config->_ifmRamSizeBytes = _ifmRamSizeBytes; + config->_traversal = _traversal; + config->_accumulatorType = _accumulatorType; + config->_accumulatorSource = _accumulatorSource; + config->_accumulatorOutputEnabled = _accumulatorOutputEnabled; + config->_ofmBlock = _ofmBlock; + config->_ofmUBlock = _ofmUBlock; + config->_ifmBlock = _ifmBlock; + return std::unique_ptr(config.release()); +} + +int EthosU85OpConfig::MaxIFMBuffering() +{ + return _ifmRamSizeBytes; +} + +Point2i EthosU85OpConfig::OptimalStripeGranule() +{ + return _ofmBlock.WH(); +} + +int EthosU85OpConfig::OptimalDepthGranule() +{ + return _ofmBlock.Depth(); +} + +std::string EthosU85OpConfig::ToString(bool full) +{ + std::string tmp = fmt::format("OFM Block=[{}], IFM Block=[{}], OFM UBlock=[{}] Traversal={}, AccType={}", _ofmBlock.ToString(), + _ifmBlock.ToString(), _ofmUBlock.ToString(), EnumToString(_traversal), EnumToString(_accumulatorType)); + UNUSED(full); + return tmp; +} + +EthosU85NpuOp 
ArchEthosU85::GetHWOp(OpType type) +{ + static const std::unordered_map toNpuOp = { + {OpType::DepthwiseConv2DBias, EthosU85NpuOp::Depthwise}, + {OpType::Conv2D, EthosU85NpuOp::Convolution}, + {OpType::Conv2DBackpropInput, EthosU85NpuOp::Convolution}, + {OpType::Conv2DBackpropInputSwitchedBias, EthosU85NpuOp::Convolution}, + {OpType::Conv2DBias, EthosU85NpuOp::Convolution}, + {OpType::ReduceSum, EthosU85NpuOp::ReduceSum}, + {OpType::FullyConnected, EthosU85NpuOp::VectorProduct}, + {OpType::MatMul, EthosU85NpuOp::VectorProduct}, + {OpType::MaxPool, EthosU85NpuOp::Pooling}, + {OpType::AvgPool, EthosU85NpuOp::Pooling}, + {OpType::QuantizedAvgPool, EthosU85NpuOp::Pooling}, + {OpType::QuantizedMaxPool, EthosU85NpuOp::Pooling}, + {OpType::Sum, EthosU85NpuOp::Pooling}, + {OpType::Min, EthosU85NpuOp::Pooling}, + {OpType::ArgMax, EthosU85NpuOp::Pooling}, + // TODO MLBEDSW-7986 add none pooling + {OpType::Resize, EthosU85NpuOp::Resize}, + {OpType::Gather, EthosU85NpuOp::Dma}, + {OpType::Scatter, EthosU85NpuOp::Dma}, + }; + + auto pos = toNpuOp.find(type); + if ( pos != toNpuOp.end() ) + { + return pos->second; + } + else if ( EthosU85RCSGenerator::IsSupportedElementwise(type) ) + { + return EthosU85NpuOp::Elementwise; + } + else if ( UseAvgPoolNop(type) ) + { + return EthosU85NpuOp::Pooling; + } + return EthosU85NpuOp::None; +} + +// TODO: this is activation fusing only +int EthosU85OpGroup::Add(const ArchitectureOpGroupQuery &op, const std::vector &dependsOn) +{ + LOG_TRACE1("Trying to add op {}\n", OpTypeToString(op.type)); + + if ( _opsCount >= 2 ) + { + // Can only fuse 2 ops + return 0; + } + + for ( int dep : dependsOn ) + { + if ( dep > 0 ) + { + // Don't validate user-specified (positive keys) dependencies + continue; + } + else if ( dep < 0 ) + { + // Convert to group generated keys (negative keys) to array index + dep = (-dep) - 1; + if ( dep >= _opsCount ) + { + // Missing dependency + return 0; + } + } + + const EthosU85OpGroup::OpInfo &prevOp = _ops[dep]; + if ( 
prevOp.ofm.key != op.ifm.key && prevOp.ofm.key != op.ifm2.key ) + { + // Can only fuse when ops are connected + return 0; + } + } + if ( !CanRunOnNPU(op) ) + { + // Can only fuse NPU ops + return 0; + } + + if ( _opsCount > 0 ) + { + if ( !IsActivation(op.type) ) + { + // Can only fuse with activation + return 0; + } + else if ( op.ifm.type == DataType::Int16 && (op.type == OpType::Sigmoid || op.type == OpType::Tanh) ) + { + // Can not fuse int16 Sigmoid and Tanh LUT since they require special scaling done by AvgPoolNop + return 0; + } + } + + // Generated key + int key = (-_opsCount) - 1; + + // Save copy of op + _ops[_opsCount].type = op.type; + _ops[_opsCount].ifm.key = op.ifm.key; + _ops[_opsCount].ifm.type = op.ifm.type; + _ops[_opsCount].ifm2.key = op.ifm2.key; + _ops[_opsCount].ifm2.type = op.ifm2.type; + _ops[_opsCount].ofm.key = op.ofm.key; + _ops[_opsCount].ofm.type = op.ofm.type; + _opsInternal[_opsCount].dependsOn = dependsOn; + _opsCount++; + + return key; +} + +// TODO: This table is from the EthosU55/U65 Embedded NPU Interface Specification, it's not completely valid for +// Ethos U85 since the allowed data types depend on ifm/ofm as well as selected acc and scaling. 
+static const std::unordered_map>> s_opDataTypeSupport = { + {EthosU85NpuOp::Convolution, + { + {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + }}, + {EthosU85NpuOp::Depthwise, + { + {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + }}, + {EthosU85NpuOp::VectorProduct, + { + {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + }}, + {EthosU85NpuOp::Pooling, + { + {DataType::UInt8, {DataType::UInt8, DataType::Int32, DataType::Int64}}, + {DataType::Int8, {DataType::Int8, DataType::Int32, DataType::Int64}}, + {DataType::Int16, {DataType::Int16}}, + }}, + {EthosU85NpuOp::ReduceSum, + { + {DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + {DataType::Int32, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32}}, + }}, + {EthosU85NpuOp::Dma, + { + {DataType::UInt8, {DataType::UInt8}}, + {DataType::Int8, {DataType::Int8}}, + {DataType::Int16, {DataType::Int16}}, + {DataType::Int32, {DataType::Int32}}, + }}, + {EthosU85NpuOp::Resize, + { + 
{DataType::UInt8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + {DataType::Int8, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + {DataType::Int16, {DataType::UInt8, DataType::Int8, DataType::Int16, DataType::Int32, DataType::Int64}}, + }}, +}; + +bool EthosU85OpGroup::CanRunOnNPU(const ArchitectureOpGroupQuery &op) +{ + EthosU85NpuOp npuOp = ArchEthosU85::GetHWOp(op.type); + + if ( IsFloat(op.ifm.type | op.ifm2.type | op.ofm.type) ) + { + return false; + } + + if ( npuOp == EthosU85NpuOp::None ) + { + return false; + } + + auto k = op.kernel; + if ( k->Stride().x > 3 || k->Stride().y > 3 ) + { + return false; + } + + if ( k->Dilation().x > 2 || k->Dilation().y > 2 ) + { + return false; + } + + switch ( npuOp ) + { + case EthosU85NpuOp::Convolution: + case EthosU85NpuOp::Depthwise: + case EthosU85NpuOp::VectorProduct: + case EthosU85NpuOp::Pooling: + case EthosU85NpuOp::ReduceSum: + case EthosU85NpuOp::Elementwise: + case EthosU85NpuOp::Resize: + case EthosU85NpuOp::Dma: + break; + default: + assert(false && "Unrecognized HWOp"); + return false; + } + + // Check allowed ifm/ofm data type mapping + if ( npuOp != EthosU85NpuOp::Elementwise ) + { + if ( op.type == OpType::LUT || op.type == OpType::MemoryCopy ) + { // TODO: LUT operations end up here due to UseAvgPoolNop although the rules are not the same as + // for a Pooling operation, so skip checks for now. 
+ return true; + } + + auto map = s_opDataTypeSupport.find(npuOp); + if ( map == s_opDataTypeSupport.end() ) + { + assert(false && "Data type mapping for HWOp missing"); + return false; + } + auto &typeMap = map->second; + auto ifmEntry = typeMap.find(op.ifm.type); + if ( ifmEntry == typeMap.end() ) + { // Unsupported ifm data type + return false; + } + auto &ofmTypes = ifmEntry->second; + if ( 0 == std::count(ofmTypes.begin(), ofmTypes.end(), op.ofm.type) ) + { // Unsupported ofm data type + return false; + } + } + else + { + // TODO: Elementwise + } + + return true; +} + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85.hpp new file mode 100644 index 00000000..831904ec --- /dev/null +++ b/ethosu/regor/architecture/ethosu85/ethos_u85.hpp @@ -0,0 +1,230 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "architecture/architecture.hpp" +#include "architecture/ethos_u_scaling.hpp" +#include "architecture/register_command_stream_generator.hpp" +#include "architecture/weight_encoder.hpp" +#include "common/bit_flags.hpp" +#include "common/shape.hpp" +#include "ethos_u85_performance.hpp" + +#include + +namespace regor +{ + +enum class EthosU85Accumulator +{ + Acc32 = 0, + Acc48 = 1, + Acc_Last = Acc48 +}; + +enum class EthosU85Traversal +{ + DepthFirst = 0, + PartKernel = 1, + Depthwise = 2, +}; + +class ArchEthosU85; + +enum class EthosU85NpuOp +{ + None = 0, + Convolution, + Depthwise, + VectorProduct, + Pooling, + ReduceSum, + Elementwise, + Resize, + Dma, +}; + +/// +/// Per-operator architecture configuration +/// +class EthosU85OpConfig : public ArchitectureOpConfig +{ + friend class ArchEthosU85; + friend class EthosU85RCSGenerator; + +private: + Shape _ifmBlock; + Shape _ofmBlock; + Shape _ofmUBlock; + EthosU85Accumulator _accumulatorType = EthosU85Accumulator::Acc32; + ArchAccumulatorSource _accumulatorSource = ArchAccumulatorSource::Reset; + bool _accumulatorOutputEnabled = true; + EthosU85Traversal _traversal = EthosU85Traversal::DepthFirst; + int _ifmRamSizeBytes = 0; + +public: + EthosU85Traversal Traversal() const { return _traversal; } + const Shape &IfmBlock() const { return _ifmBlock; } + const Shape &OfmBlock() const { return _ofmBlock; } + const Shape &OfmUBlock() const { return _ofmUBlock; } + EthosU85Accumulator Acc() const { return _accumulatorType; } + ArchAccumulatorSource AccSource() const { return _accumulatorSource; } + bool AccOutputEnabled() const { return _accumulatorOutputEnabled; } + std::unique_ptr Clone() override; + int MaxIFMBuffering() override; + Point2i OptimalStripeGranule() override; + int OptimalDepthGranule() override; + std::string ToString(bool full) override; +}; + +/// +/// Group of ops that can be fused and/or chained +/// +class EthosU85OpGroup : public ArchitectureOpGroup +{ + friend class 
ArchEthosU85; + + using OpInfo = ArchitectureOpGroupQuery; + + struct InternalOpInfo + { + std::vector dependsOn; + }; + +private: + std::array _ops; + std::array _opsInternal; + int _opsCount = 0; + +public: + int Add(const ArchitectureOpGroupQuery &op, const std::vector &dependsOn = {}) override; + +protected: + bool CanRunOnNPU(const ArchitectureOpGroupQuery &op) override; +}; + +/// +/// EthosU85 specialisation +/// +class ArchEthosU85 : public Architecture +{ + friend class EthosU85WeightEncoder; + friend class EthosU85Performance; + friend class EthosU85RCSGenerator; + friend class EthosU85OpGroup; + +public: + struct AcceleratorConfig + { + int macs; + int cores; + std::array ofmUBlocks; + int nOfmUBlocks; + Shape ifmUBlock; + int ifmRamSizeBytes; + int accRamSizeBytes; + int elemUnits; + int numAxiSramLog2; + int numAxiExtLog2; + const EthosU85PerfInfo *perfInfo; + }; + +private: + static constexpr int LUT_SLOT_SIZE = 256; + std::unique_ptr _lutRam; + Shape _subkernelMax; + Shape _ofmBlockMax; + int _cores = 0; + int _macs = 0; + std::array _ofmUBlocks; + int _nOfmUBlocks = 1; + // maps ofm microblock and ifmbits to a bitmask of supported operations + std::array, 3> _uBlockToOpTable{}; + // maps ofm microblock to supported IFM allocation unit + std::array, 3> _uBlockToIfmAuTable{}; + Shape _ifmUBlock; + int _ifmRamSizeBytes = 0; + int _accRamSizeBytes = 0; + int _numAxiSramLog2 = 0; + int _numAxiExtLog2 = 0; + +protected: + std::unique_ptr _weightEncoder; + std::unique_ptr _performance; + std::unique_ptr _rcsGenerator; + +public: + ArchEthosU85(); + + + bool ParseConfig(IniReader *reader) override; + + std::unique_ptr GetOpConfig(OpType opType, const ArchitectureConfigQuery &query) override; + std::unique_ptr CreateOpGroup(const ArchitectureOpGroupQuery &op) override; + class WeightEncoder *WeightEncoder() override { return _weightEncoder.get(); } + IRegisterCommandStreamGenerator *RegisterCommandStreamGenerator() override { return _rcsGenerator.get(); } + 
ArchitecturePerformance *Performance() override { return _performance.get(); } + TensorFormat IdealBufferingFormat() override { return TensorFormat::NHCWB16; } + Address MaxAddress() override { return 1LL << 32; } + std::vector ConfigRegisters() override; + int UpscaleAndRounding(ArchResampling resampling, int &rounding) override; + AxisMask CanSubdivide(OpType opType) override; + bool SupportsLeakyRelu(bool quantized, DataType type) override; + bool SupportsMatMul(OpType opType) override; + bool SupportsTranspose(OpType opType, TransposeType transposeType) override; + bool SupportsReverse(OpType opType, ReverseType reverseType) override; + bool SupportsGather(OpType opType) override; + bool SupportsScatter(OpType opType) override; + bool SupportsSigmoidTanhLutInt16(OpType opType) override; + bool SupportsResize(const ResizeSupportQuery &query) override; + bool SupportsAccumulatorMode(ArchAccumulatorSource source, bool outputEnabled) override; + bool SupportsScalar(OpType opType, DataType dataType, TensorUsage usage) override; + bool SupportsArgMax(OpType opType) override; + Flags SupportedWeightFormat(OpType op) override; + uint32_t Version() override; + +protected: + void ApplyConfig(const AcceleratorConfig *cfg); + + std::unique_ptr FindBlockConfig(OpType opType, const ArchitectureConfigQuery &query); + + bool TryBlockConfig(EthosU85NpuOp npuOp, const Shape &ofmBlock, const Shape &ifmBlock, const Shape &ifmShape, + int ifmBits, int accBits, int ifmSpace, int accSpace, int ifmAuDepth, int numBlocksInRam, bool isEqualDepthOp); + + Shape GetStorageRounding(TensorFormat format); + + uint32_t ConfigRegister(int product); + // Checks if the operation is to be mapped on AvgPool + static bool UseAvgPoolNop(OpType type); + static EthosU85NpuOp GetHWOp(OpType type); + +private: + int MaxOutstandingKernelOps() { return 2; } + int MaxOutstandingDMAOps() { return 4; } + int MaxBlockdep() { return 7; } + bool IsUBlockValid(const OpType opType, int ifmBits, const Shape 
&ofmUBlock, bool hasIfm2); + Shape FindUBlock(OpType opType, const ArchitectureConfigQuery &query); + Shape CalcIfmAUSize(int IfmBlkDepth, int ifmBits, Shape ofmUBlk); + int CalcResizeMaxOfmBlockWidth(int ifmBits, int scaleN, int scaleD); + int IndexForOfmUBlock(const Shape &ofmUBlock); + void SetupOfmUBlockToOpTable(); + void SetupOfmUBlockToIfmAuTable(); +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_interface.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_interface.hpp new file mode 100644 index 00000000..fa7e7b70 --- /dev/null +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_interface.hpp @@ -0,0 +1,24312 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#ifdef __KERNEL__ +#include +#else +#include +#endif + +#if !defined(__cplusplus) || __cplusplus < 201402L +#define CONSTEXPR +#else +#define CONSTEXPR constexpr +#endif + +#ifndef __cplusplus +#define STRUCT struct +#else +#define STRUCT +#endif + +#if defined(__cplusplus) && defined(NPU_DISASSEMBLE) +#include +#include +#include +#endif + +#if defined(__cplusplus) && !defined(NPU_NAMESPACE) +#define NPU_NAMESPACE npu +#endif + +#ifdef __cplusplus +#include +#include +#include +#endif + +#ifdef __cplusplus +namespace NPU_NAMESPACE +{ +#endif +#define NNX_ARCH_VERSION_MAJOR 2 +#define NNX_ARCH_VERSION_MINOR 0 +#define NNX_ARCH_VERSION_PATCH 0 + + + + + +#define NPU_REG_ID 0x0000 +#define NPU_REG_STATUS 0x0004 +#define NPU_REG_CMD 0x0008 +#define NPU_REG_RESET 0x000C +#define NPU_REG_QBASE 0x0010 +#define NPU_REG_QBASE_HI 0x0014 +#define NPU_REG_QREAD 0x0018 +#define NPU_REG_QCONFIG 0x001C +#define NPU_REG_QSIZE 0x0020 +#define NPU_REG_PROT 0x0024 +#define NPU_REG_CONFIG 0x0028 +#define NPU_REG_COND_STATUS 0x0030 +#define NPU_REG_POWER_CTRL 0x0038 +#define NPU_REG_REGIONCFG 0x003C +#define NPU_REG_MEM_ATTR_BASE 0x0040 +#define NPU_REG_MEM_ATTR_ARRLEN 0x0004 +#define NPU_REG_AXI_SRAM 0x0050 +#define NPU_REG_AXI_EXT 0x0054 +#define NPU_REG_CFG_SRAM_CAP 0x0060 +#define NPU_REG_CFG_EXT_CAP 0x0064 +#define NPU_REG_CFG_SRAM_HASH0 0x0068 +#define NPU_REG_CFG_SRAM_HASH0_HI 0x006C +#define NPU_REG_CFG_SRAM_HASH1 0x0070 +#define NPU_REG_CFG_SRAM_HASH1_HI 0x0074 +#define NPU_REG_CFG_EXT_HASH0 0x0078 +#define NPU_REG_CFG_EXT_HASH0_HI 0x007C +#define BASE_REGISTERS_SIZE 0x0080 + + + + +#define NPU_REG_BASEP_BASE 0x0080 +#define NPU_REG_BASEP_ARRLEN 0x0008 +#define BASE_POINTERS_REGISTERS_SIZE 0x0100 + + + + +#define NPU_REG_CLKFORCE 0x0140 +#define NPU_REG_DEBUG_ADDRESS 0x0144 +#define NPU_REG_DEBUG_MISC 0x0148 +#define DEBUG_REGISTERS_SIZE 0x0180 + + + + +#define NPU_REG_DMA_IFM_SRC 0x0240 +#define NPU_REG_DMA_IFM_SRC_HI 0x0244 +#define 
NPU_REG_DMA_IFM_DST 0x0248 +#define NPU_REG_DMA_OFM_SRC 0x024C +#define NPU_REG_DMA_OFM_DST 0x0250 +#define NPU_REG_DMA_OFM_DST_HI 0x0254 +#define NPU_REG_DMA_WEIGHT_SRC 0x0258 +#define NPU_REG_DMA_WEIGHT_SRC_HI 0x025C +#define NPU_REG_DMA_CMD_SRC 0x0260 +#define NPU_REG_DMA_CMD_SRC_HI 0x0264 +#define NPU_REG_DMA_CMD_SIZE 0x0268 +#define NPU_REG_DMA_M2M_SRC 0x026C +#define NPU_REG_DMA_M2M_SRC_HI 0x0270 +#define NPU_REG_DMA_M2M_DST 0x0274 +#define NPU_REG_DMA_M2M_DST_HI 0x0278 +#define NPU_REG_CURRENT_QREAD 0x027C +#define NPU_REG_DMA_SCALE_SRC 0x0280 +#define NPU_REG_DMA_SCALE_SRC_HI 0x0284 +#define NPU_REG_DMA_WEIGHT1_SRC 0x0288 +#define NPU_REG_DMA_WEIGHT1_SRC_HI 0x028C +#define NPU_REG_DMA_WEIGHT2_SRC 0x0290 +#define NPU_REG_DMA_WEIGHT2_SRC_HI 0x0294 +#define NPU_REG_DMA_WEIGHT3_SRC 0x0298 +#define NPU_REG_DMA_WEIGHT3_SRC_HI 0x029C +#define NPU_REG_CURRENT_OP 0x02B8 +#define NPU_REG_CURRENT_CMD 0x02BC +#define TSU_DEBUG_REGISTERS_SIZE 0x0300 + + + + +#define NPU_REG_INTERNAL_MEMORY_BASE 0x0400 +#define NPU_REG_INTERNAL_MEMORY_ARRLEN 0x0100 +#define INTERNAL_MEMORY_REGISTERS_SIZE 0x0800 + + + + +#define NPU_REG_IFM_PAD_TOP 0x0800 +#define NPU_REG_IFM_PAD_LEFT 0x0804 +#define NPU_REG_IFM_PAD_RIGHT 0x0808 +#define NPU_REG_IFM_PAD_BOTTOM 0x080C +#define NPU_REG_IFM_DEPTH_M1 0x0810 +#define NPU_REG_IFM_PRECISION 0x0814 +#define NPU_REG_IFM_UPSCALE 0x081C +#define NPU_REG_IFM_BROADCAST 0x0820 +#define NPU_REG_IFM_ZERO_POINT 0x0824 +#define NPU_REG_IFM_WIDTH0_M1 0x0828 +#define NPU_REG_IFM_HEIGHT0_M1 0x082C +#define NPU_REG_IFM_HEIGHT1_M1 0x0830 +#define NPU_REG_IFM_REGION 0x083C +#define TSU_IFM_REGISTERS_SIZE 0x0840 + + + + +#define NPU_REG_OFM_WIDTH_M1 0x0844 +#define NPU_REG_OFM_HEIGHT_M1 0x0848 +#define NPU_REG_OFM_DEPTH_M1 0x084C +#define NPU_REG_OFM_PRECISION 0x0850 +#define NPU_REG_OFM_BLK_WIDTH_M1 0x0854 +#define NPU_REG_OFM_BLK_HEIGHT_M1 0x0858 +#define NPU_REG_OFM_BLK_DEPTH_M1 0x085C +#define NPU_REG_OFM_ZERO_POINT 0x0860 +#define NPU_REG_OFM_WIDTH0_M1 
0x0868 +#define NPU_REG_OFM_HEIGHT0_M1 0x086C +#define NPU_REG_OFM_HEIGHT1_M1 0x0870 +#define NPU_REG_OFM_REGION 0x087C +#define TSU_OFM_REGISTERS_SIZE 0x0880 + + + + +#define NPU_REG_KERNEL_WIDTH_M1 0x0880 +#define NPU_REG_KERNEL_HEIGHT_M1 0x0884 +#define NPU_REG_KERNEL_STRIDE 0x0888 +#define NPU_REG_ACC_FORMAT 0x0890 +#define NPU_REG_ACTIVATION 0x0894 +#define NPU_REG_ACTIVATION_MIN 0x0898 +#define NPU_REG_ACTIVATION_MAX 0x089C +#define NPU_REG_WEIGHT_REGION 0x08A0 +#define NPU_REG_SCALE_REGION 0x08A4 +#define NPU_REG_RESIZE_X_SCALE_N_M1 0x08A8 +#define NPU_REG_RESIZE_Y_SCALE_N_M1 0x08AC +#define NPU_REG_RESIZE_X_OFFSET 0x08B0 +#define NPU_REG_RESIZE_Y_OFFSET 0x08B4 +#define NPU_REG_WEIGHT_FORMAT 0x08B8 +#define NPU_REG_BLOCKDEP 0x08BC +#define TSU_KERNEL_REGISTERS_SIZE 0x08C0 + + + + +#define NPU_REG_DMA0_SRC_REGION 0x08C0 +#define NPU_REG_DMA0_DST_REGION 0x08C4 +#define NPU_REG_DMA0_SIZE0 0x08C8 +#define NPU_REG_DMA0_SIZE1 0x08CC +#define NPU_REG_DMA0_IDX_REGION 0x08D0 +#define TSU_DMA_REGISTERS_SIZE 0x0900 + + + + +#define NPU_REG_IFM2_BROADCAST 0x0900 +#define NPU_REG_IFM2_PRECISION 0x0914 +#define NPU_REG_IFM2_ZERO_POINT 0x0924 +#define NPU_REG_IFM2_WIDTH0_M1 0x0928 +#define NPU_REG_IFM2_HEIGHT0_M1 0x092C +#define NPU_REG_IFM2_HEIGHT1_M1 0x0930 +#define NPU_REG_IFM2_REGION 0x093C +#define TSU_IFM2_REGISTERS_SIZE 0x0940 + + + + +#define NPU_REG_IFM_BASE0 0x0A00 +#define NPU_REG_IFM_BASE0_HI 0x0A04 +#define NPU_REG_IFM_BASE1 0x0A08 +#define NPU_REG_IFM_BASE1_HI 0x0A0C +#define NPU_REG_IFM_BASE2 0x0A10 +#define NPU_REG_IFM_BASE2_HI 0x0A14 +#define NPU_REG_IFM_BASE3 0x0A18 +#define NPU_REG_IFM_BASE3_HI 0x0A1C +#define NPU_REG_IFM_STRIDE_X 0x0A20 +#define NPU_REG_IFM_STRIDE_X_HI 0x0A24 +#define NPU_REG_IFM_STRIDE_Y 0x0A28 +#define NPU_REG_IFM_STRIDE_Y_HI 0x0A2C +#define NPU_REG_IFM_STRIDE_C 0x0A30 +#define NPU_REG_IFM_STRIDE_C_HI 0x0A34 +#define TSU_IFM_BASE_REGISTERS_SIZE 0x0A40 + + + + +#define NPU_REG_OFM_BASE0 0x0A40 +#define NPU_REG_OFM_BASE0_HI 0x0A44 
+#define NPU_REG_OFM_BASE1 0x0A48 +#define NPU_REG_OFM_BASE1_HI 0x0A4C +#define NPU_REG_OFM_BASE2 0x0A50 +#define NPU_REG_OFM_BASE2_HI 0x0A54 +#define NPU_REG_OFM_BASE3 0x0A58 +#define NPU_REG_OFM_BASE3_HI 0x0A5C +#define NPU_REG_OFM_STRIDE_X 0x0A60 +#define NPU_REG_OFM_STRIDE_X_HI 0x0A64 +#define NPU_REG_OFM_STRIDE_Y 0x0A68 +#define NPU_REG_OFM_STRIDE_Y_HI 0x0A6C +#define NPU_REG_OFM_STRIDE_C 0x0A70 +#define NPU_REG_OFM_STRIDE_C_HI 0x0A74 +#define TSU_OFM_BASE_REGISTERS_SIZE 0x0A80 + + + + +#define NPU_REG_WEIGHT_BASE 0x0A80 +#define NPU_REG_WEIGHT_BASE_HI 0x0A84 +#define NPU_REG_WEIGHT_LENGTH 0x0A88 +#define NPU_REG_WEIGHT_LENGTH_HI 0x0A8C +#define NPU_REG_SCALE_BASE 0x0A90 +#define NPU_REG_SCALE_BASE_HI 0x0A94 +#define NPU_REG_SCALE_LENGTH 0x0A98 +#define NPU_REG_SCALE_LENGTH_HI 0x0A9C +#define NPU_REG_OFM_SCALE 0x0AA0 +#define NPU_REG_OFM_SCALE_HI 0x0AA4 +#define NPU_REG_IFM_SCALE 0x0AA8 +#define NPU_REG_IFM_SCALE_HI 0x0AAC +#define NPU_REG_IFM2_SCALE 0x0AB0 +#define NPU_REG_IFM2_SCALE_HI 0x0AB4 +#define NPU_REG_OP_SCALAR 0x0AB8 +#define NPU_REG_OP_SCALAR_HI 0x0ABC +#define TSU_WS_BASE_REGISTERS_SIZE 0x0AC0 + + + + +#define NPU_REG_DMA0_SRC 0x0AC0 +#define NPU_REG_DMA0_SRC_HI 0x0AC4 +#define NPU_REG_DMA0_DST 0x0AC8 +#define NPU_REG_DMA0_DST_HI 0x0ACC +#define NPU_REG_DMA0_LEN 0x0AD0 +#define NPU_REG_DMA0_LEN_HI 0x0AD4 +#define NPU_REG_DMA0_SRC_STRIDE0 0x0AD8 +#define NPU_REG_DMA0_SRC_STRIDE0_HI 0x0ADC +#define NPU_REG_DMA0_SRC_STRIDE1 0x0AE0 +#define NPU_REG_DMA0_SRC_STRIDE1_HI 0x0AE4 +#define NPU_REG_DMA0_DST_STRIDE0 0x0AE8 +#define NPU_REG_DMA0_DST_STRIDE0_HI 0x0AEC +#define NPU_REG_DMA0_DST_STRIDE1 0x0AF0 +#define NPU_REG_DMA0_DST_STRIDE1_HI 0x0AF4 +#define NPU_REG_DMA0_IDX 0x0AF8 +#define NPU_REG_DMA0_IDX_HI 0x0AFC +#define TSU_DMA_BASE_REGISTERS_SIZE 0x0B00 + + + + +#define NPU_REG_IFM2_BASE0 0x0B00 +#define NPU_REG_IFM2_BASE0_HI 0x0B04 +#define NPU_REG_IFM2_BASE1 0x0B08 +#define NPU_REG_IFM2_BASE1_HI 0x0B0C +#define NPU_REG_IFM2_BASE2 0x0B10 +#define 
NPU_REG_IFM2_BASE2_HI 0x0B14 +#define NPU_REG_IFM2_BASE3 0x0B18 +#define NPU_REG_IFM2_BASE3_HI 0x0B1C +#define NPU_REG_IFM2_STRIDE_X 0x0B20 +#define NPU_REG_IFM2_STRIDE_X_HI 0x0B24 +#define NPU_REG_IFM2_STRIDE_Y 0x0B28 +#define NPU_REG_IFM2_STRIDE_Y_HI 0x0B2C +#define NPU_REG_IFM2_STRIDE_C 0x0B30 +#define NPU_REG_IFM2_STRIDE_C_HI 0x0B34 +#define TSU_IFM2_BASE_REGISTERS_SIZE 0x0B40 + + + + +#define NPU_REG_WEIGHT1_BASE 0x0B40 +#define NPU_REG_WEIGHT1_BASE_HI 0x0B44 +#define NPU_REG_WEIGHT1_LENGTH 0x0B48 +#define NPU_REG_WEIGHT1_LENGTH_HI 0x0B4C +#define NPU_REG_WEIGHT2_BASE 0x0B50 +#define NPU_REG_WEIGHT2_BASE_HI 0x0B54 +#define NPU_REG_WEIGHT2_LENGTH 0x0B58 +#define NPU_REG_WEIGHT2_LENGTH_HI 0x0B5C +#define NPU_REG_WEIGHT3_BASE 0x0B60 +#define NPU_REG_WEIGHT3_BASE_HI 0x0B64 +#define NPU_REG_WEIGHT3_LENGTH 0x0B68 +#define NPU_REG_WEIGHT3_LENGTH_HI 0x0B6C +#define NPU_REG_RESIZE_X_STEP 0x0B70 +#define NPU_REG_RESIZE_X_STEP_HI 0x0B74 +#define NPU_REG_RESIZE_Y_STEP 0x0B78 +#define NPU_REG_RESIZE_Y_STEP_HI 0x0B7C +#define TSU_WS1_BASE_REGISTERS_SIZE 0x0B80 + + + + +#define TSU_USER_BASE_REGISTERS_SIZE 0x0BC0 + + + + +#define NPU_REG_DMA0_IDX_MAX 0x0BC0 +#define NPU_REG_DMA0_IDX_MAX_HI 0x0BC4 +#define NPU_REG_DMA0_IDX_SKIP1 0x0BC8 +#define NPU_REG_DMA0_IDX_SKIP1_HI 0x0BCC +#define TSU_DMA_EBASE_REGISTERS_SIZE 0x0C00 + + + + +#define NPU_REG_REVISION 0x0FC0 +#define NPU_REG_PID4 0x0FD0 +#define NPU_REG_PID5 0x0FD4 +#define NPU_REG_PID6 0x0FD8 +#define NPU_REG_PID7 0x0FDC +#define NPU_REG_PID0 0x0FE0 +#define NPU_REG_PID1 0x0FE4 +#define NPU_REG_PID2 0x0FE8 +#define NPU_REG_PID3 0x0FEC +#define NPU_REG_CID0 0x0FF0 +#define NPU_REG_CID1 0x0FF4 +#define NPU_REG_CID2 0x0FF8 +#define NPU_REG_CID3 0x0FFC +#define ID_REGISTERS_SIZE 0x1000 + + + + +#define NPU_REG_WD_STATUS 0x1100 +#define NPU_REG_MAC_STATUS 0x1104 +#define NPU_REG_AO_STATUS 0x1108 +#define NPU_REG_DMA_STATUS0 0x1110 +#define NPU_REG_DMA_STATUS1 0x1114 +#define DEBUG_STATUS_REGISTERS_SIZE 0x1180 + + + + +#define 
NPU_REG_PMCR 0x1180 +#define NPU_REG_PMCNTENSET 0x1184 +#define NPU_REG_PMCNTENCLR 0x1188 +#define NPU_REG_PMOVSSET 0x118C +#define NPU_REG_PMOVSCLR 0x1190 +#define NPU_REG_PMINTSET 0x1194 +#define NPU_REG_PMINTCLR 0x1198 +#define NPU_REG_PMCCNTR 0x11A0 +#define NPU_REG_PMCCNTR_HI 0x11A4 +#define NPU_REG_PMCCNTR_CFG 0x11A8 +#define NPU_REG_PMCAXI_CHAN 0x11AC +#define NPU_REG_PMCLUT 0x11B0 +#define PMU_REGISTERS_SIZE 0x1200 + + + + +#define NPU_REG_PMEVCNTR_BASE 0x1300 +#define NPU_REG_PMEVCNTR_ARRLEN 0x0008 +#define NPU_REG_PMEVTYPER_BASE 0x1380 +#define NPU_REG_PMEVTYPER_ARRLEN 0x0008 +#define PMU_COUNTERS_REGISTERS_SIZE 0x1400 + +#ifdef __cplusplus + +enum class acc_format : uint8_t +{ + I32 = 0, + I48 = 1, +}; + +enum class acc_input : uint8_t +{ + RESET = 0, + KEEP = 1, + IFM2 = 2, +}; + +enum class acc_output : uint8_t +{ + ENABLE = 0, + DISABLE = 1, +}; + +enum class activation_clip_range : uint8_t +{ + B16 = 0, + NONE = 1, +}; + +enum class activation_format : uint8_t +{ + NHWC = 0, + NHCWB16 = 1, +}; + +enum class activation_function : uint8_t +{ + LUT_NONE = 0, + LUT_U8_U8 = 1, + LUT_S8_S8 = 4, + LUT_S8_S16 = 5, + LUT_S8_S32 = 7, + LUT_S16_S16 = 8, + LUT_S16_S32 = 9, + LUT_TANH = 10, + LUT_SIGMOID = 11, +}; + +enum class activation_precision : uint8_t +{ + B8 = 0, + B16 = 1, + B32 = 2, + B64 = 3, +}; + +enum class activation_reverse : uint8_t +{ + NONE = 0, + H = 1, + W = 2, + C = 3, +}; + +enum class activation_storage : uint8_t +{ + TILE2X2 = 0, + TILE3X1 = 1, + CHAINED = 2, + NONE = 3, +}; + +enum class activation_transpose : uint8_t +{ + HWC = 0, + WHC = 1, + HCW = 2, + WCH = 3, + CHW = 6, + CWH = 7, +}; + +enum class activation_type : uint8_t +{ + UNSIGNED = 0, + SIGNED = 1, +}; + +enum class axi_mem_domain : uint8_t +{ + NON_SHARABLE = 0, + INNER_SHARABLE = 1, + OUTER_SHARABLE = 2, + SYSTEM = 3, +}; + +enum class axi_mem_encoding : uint8_t +{ + DEVICE_NON_BUFFERABLE = 0, + DEVICE_BUFFERABLE = 1, + NORMAL_NON_CACHEABLE_NON_BUFFERABLE = 2, + 
NORMAL_NON_CACHEABLE_BUFFERABLE = 3, + WRITE_THROUGH_NO_ALLOCATE = 4, + WRITE_THROUGH_READ_ALLOCATE = 5, + WRITE_THROUGH_WRITE_ALLOCATE = 6, + WRITE_THROUGH_READ_AND_WRITE_ALLOCATE = 7, + WRITE_BACK_NO_ALLOCATE = 8, + WRITE_BACK_READ_ALLOCATE = 9, + WRITE_BACK_WRITE_ALLOCATE = 10, + WRITE_BACK_READ_AND_WRITE_ALLOCATE = 11, +}; + +enum class axi_port : uint8_t +{ + SRAM = 0, + EXT = 1, +}; + +enum class branch_cond : uint8_t +{ + ALWAYS = 0, + RF_TRUE = 1, +}; + +enum class broadcast_mode : uint8_t +{ + NONE = 0, + H = 1, + W = 2, + HW = 3, + C = 4, + CH = 5, + CW = 6, + CWH = 7, + SCALAR = 8, +}; + +enum class cmd0_opcode : uint16_t +{ + NPU_OP_STOP = 0, + NPU_OP_IRQ = 1, + NPU_OP_CONV = 2, + NPU_OP_DEPTHWISE = 3, + NPU_OP_POOL = 5, + NPU_OP_ELEMENTWISE = 6, + NPU_OP_RESIZE = 7, + NPU_OP_DMA_START = 16, + NPU_OP_DMA_WAIT = 17, + NPU_OP_KERNEL_WAIT = 18, + NPU_OP_PMU_MASK = 19, + NPU_SET_IFM_PAD_TOP = 256, + NPU_SET_IFM_PAD_LEFT = 257, + NPU_SET_IFM_PAD_RIGHT = 258, + NPU_SET_IFM_PAD_BOTTOM = 259, + NPU_SET_IFM_DEPTH_M1 = 260, + NPU_SET_IFM_PRECISION = 261, + NPU_SET_IFM_UPSCALE = 263, + NPU_SET_IFM_BROADCAST = 264, + NPU_SET_IFM_ZERO_POINT = 265, + NPU_SET_IFM_WIDTH0_M1 = 266, + NPU_SET_IFM_HEIGHT0_M1 = 267, + NPU_SET_IFM_HEIGHT1_M1 = 268, + NPU_SET_IFM_REGION = 271, + NPU_SET_OFM_WIDTH_M1 = 273, + NPU_SET_OFM_HEIGHT_M1 = 274, + NPU_SET_OFM_DEPTH_M1 = 275, + NPU_SET_OFM_PRECISION = 276, + NPU_SET_OFM_BLK_WIDTH_M1 = 277, + NPU_SET_OFM_BLK_HEIGHT_M1 = 278, + NPU_SET_OFM_BLK_DEPTH_M1 = 279, + NPU_SET_OFM_ZERO_POINT = 280, + NPU_SET_OFM_WIDTH0_M1 = 282, + NPU_SET_OFM_HEIGHT0_M1 = 283, + NPU_SET_OFM_HEIGHT1_M1 = 284, + NPU_SET_OFM_REGION = 287, + NPU_SET_KERNEL_WIDTH_M1 = 288, + NPU_SET_KERNEL_HEIGHT_M1 = 289, + NPU_SET_KERNEL_STRIDE = 290, + NPU_SET_ACC_FORMAT = 292, + NPU_SET_ACTIVATION = 293, + NPU_SET_ACTIVATION_MIN = 294, + NPU_SET_ACTIVATION_MAX = 295, + NPU_SET_WEIGHT_REGION = 296, + NPU_SET_SCALE_REGION = 297, + NPU_SET_RESIZE_X_SCALE_N_M1 = 298, + 
NPU_SET_RESIZE_Y_SCALE_N_M1 = 299, + NPU_SET_RESIZE_X_OFFSET = 300, + NPU_SET_RESIZE_Y_OFFSET = 301, + NPU_SET_WEIGHT_FORMAT = 302, + NPU_SET_BLOCKDEP = 303, + NPU_SET_DMA0_SRC_REGION = 304, + NPU_SET_DMA0_DST_REGION = 305, + NPU_SET_DMA0_SIZE0 = 306, + NPU_SET_DMA0_SIZE1 = 307, + NPU_SET_DMA0_IDX_REGION = 308, + NPU_SET_IFM2_BROADCAST = 384, + NPU_SET_IFM2_PRECISION = 389, + NPU_SET_IFM2_ZERO_POINT = 393, + NPU_SET_IFM2_WIDTH0_M1 = 394, + NPU_SET_IFM2_HEIGHT0_M1 = 395, + NPU_SET_IFM2_HEIGHT1_M1 = 396, + NPU_SET_IFM2_REGION = 399, +}; + +enum class cmd1_opcode : uint16_t +{ + NPU_SET_IFM_BASE0 = 0, + NPU_SET_IFM_BASE1 = 1, + NPU_SET_IFM_BASE2 = 2, + NPU_SET_IFM_BASE3 = 3, + NPU_SET_IFM_STRIDE_X = 4, + NPU_SET_IFM_STRIDE_Y = 5, + NPU_SET_IFM_STRIDE_C = 6, + NPU_SET_OFM_BASE0 = 16, + NPU_SET_OFM_BASE1 = 17, + NPU_SET_OFM_BASE2 = 18, + NPU_SET_OFM_BASE3 = 19, + NPU_SET_OFM_STRIDE_X = 20, + NPU_SET_OFM_STRIDE_Y = 21, + NPU_SET_OFM_STRIDE_C = 22, + NPU_SET_WEIGHT_BASE = 32, + NPU_SET_WEIGHT_LENGTH = 33, + NPU_SET_SCALE_BASE = 34, + NPU_SET_SCALE_LENGTH = 35, + NPU_SET_OFM_SCALE = 36, + NPU_SET_IFM_SCALE = 37, + NPU_SET_IFM2_SCALE = 38, + NPU_SET_OP_SCALAR = 39, + NPU_SET_DMA0_SRC = 48, + NPU_SET_DMA0_DST = 49, + NPU_SET_DMA0_LEN = 50, + NPU_SET_DMA0_SRC_STRIDE0 = 51, + NPU_SET_DMA0_SRC_STRIDE1 = 52, + NPU_SET_DMA0_DST_STRIDE0 = 53, + NPU_SET_DMA0_DST_STRIDE1 = 54, + NPU_SET_DMA0_IDX = 55, + NPU_SET_DMA0_IDX_MAX = 56, + NPU_SET_DMA0_IDX_SKIP1 = 57, + NPU_SET_IFM2_BASE0 = 128, + NPU_SET_IFM2_BASE1 = 129, + NPU_SET_IFM2_BASE2 = 130, + NPU_SET_IFM2_BASE3 = 131, + NPU_SET_IFM2_STRIDE_X = 132, + NPU_SET_IFM2_STRIDE_Y = 133, + NPU_SET_IFM2_STRIDE_C = 134, + NPU_SET_WEIGHT1_BASE = 144, + NPU_SET_WEIGHT1_LENGTH = 145, + NPU_SET_WEIGHT2_BASE = 146, + NPU_SET_WEIGHT2_LENGTH = 147, + NPU_SET_WEIGHT3_BASE = 148, + NPU_SET_WEIGHT3_LENGTH = 149, + NPU_SET_RESIZE_X = 150, + NPU_SET_RESIZE_Y = 151, + NPU_OP_BRANCH = 256, +}; + +enum class cmd_ctrl : uint8_t +{ + CMD0_CTRL = 0, + 
CMD1_CTRL = 1, +}; + +enum class custom_dma : uint8_t +{ + NOT_IMPLEMENTED = 0, + IMPLEMENTED = 1, +}; + +enum class dma_fault_channel : uint8_t +{ + CMD_READ = 0, + IFM_READ = 1, + WEIGHT_READ = 2, + SBS_READ = 3, + MEM2MEM_READ = 4, + OFM_WRITE = 8, + MEM2MEM_WRITE = 9, +}; + +enum class dma_fault_src : uint8_t +{ + SRAM = 0, + EXT = 1, +}; + +enum class dma_idx_mode : uint8_t +{ + DISABLED = 0, + ENABLED = 1, +}; + +enum class dma_region_mode : uint8_t +{ + EXTERNAL = 0, + INTERNAL = 1, +}; + +enum class dma_stride_mode : uint8_t +{ + D1 = 0, + D2 = 1, + D3 = 2, +}; + +enum class elementwise_mode : uint8_t +{ + MUL = 0, + ADD = 1, + SUB = 2, + MIN = 3, + MAX = 4, + LRELU = 5, + ABS = 6, + CLZ = 7, + SHR = 8, + SHL = 9, + LSR = 10, + DIV = 11, + CMP_EQ = 16, + CMP_NE = 17, + CMP_GE = 18, + CMP_GT = 19, + AND = 33, + OR = 34, + XOR = 35, + NOT = 36, + AND_NOT = 42, +}; + +enum class ifm_upscale_mode : uint8_t +{ + NONE = 0, + NEAREST = 1, + ZEROS = 2, +}; + +enum class kernel_decomposition : uint8_t +{ + D8X8 = 0, + D4X4 = 1, +}; + +enum class kernel_dilation : uint8_t +{ + NONE = 0, + X2 = 1, +}; + +enum class max_beats : uint8_t +{ + B64 = 0, + B128 = 1, + B256 = 2, +}; + +enum class microblock : uint8_t +{ + U1X1 = 0, + U1X2 = 1, + U1X4 = 2, + U2X2 = 3, + U2X4 = 4, + U4X4 = 5, +}; + +enum class ofm_scale_mode : uint8_t +{ + PER_CHANNEL = 0, + GLOBAL = 1, +}; + +enum class pmu_axi_channel : uint8_t +{ + RD_CMD = 0, + RD_IFM = 1, + RD_WEIGHTS = 2, + RD_SCALE_BIAS = 3, + RD_MEM2MEM = 4, + RD_IFM_STREAM = 5, + RD_MEM2MEM_IDX = 6, + WR_OFM = 8, + WR_MEM2MEM = 9, +}; + +enum class pmu_event : uint16_t +{ + NO_EVENT = 0, + CYCLE = 17, + NPU_IDLE = 32, + CC_STALLED_ON_BLOCKDEP = 33, + CC_STALLED_ON_SHRAM_RECONFIG = 34, + NPU_ACTIVE = 35, + MAC_ACTIVE = 48, + MAC_DPU_ACTIVE = 51, + MAC_STALLED_BY_W_OR_ACC = 52, + MAC_STALLED_BY_W = 53, + MAC_STALLED_BY_ACC = 54, + MAC_STALLED_BY_IB = 55, + MAC_STALLED_BY_INT_W = 57, + MAC_STALLED_BY_INT_ACC = 58, + AO_ACTIVE = 64, + 
AO_STALLED_BY_BS_OR_OB = 67, + AO_STALLED_BY_BS = 68, + AO_STALLED_BY_OB = 69, + AO_STALLED_BY_AB_OR_CB = 70, + AO_STALLED_BY_AB = 71, + AO_STALLED_BY_CB = 72, + WD_ACTIVE = 80, + WD_STALLED = 81, + WD_STALLED_BY_WD_BUF = 83, + WD_STALLED_BY_WS_FC = 84, + WD_STALLED_BY_WS_TC = 85, + WD_TRANS_WBLK = 89, + WD_TRANS_WS_FC = 90, + WD_TRANS_WS_TC = 91, + WD_STALLED_BY_WS_SC0 = 96, + WD_STALLED_BY_WS_SC1 = 97, + WD_STALLED_BY_WS_SC2 = 98, + WD_STALLED_BY_WS_SC3 = 99, + WD_PARSE_ACTIVE_SC0 = 100, + WD_PARSE_ACTIVE_SC1 = 101, + WD_PARSE_ACTIVE_SC2 = 102, + WD_PARSE_ACTIVE_SC3 = 103, + WD_PARSE_STALL_SC0 = 104, + WD_PARSE_STALL_SC1 = 105, + WD_PARSE_STALL_SC2 = 106, + WD_PARSE_STALL_SC3 = 107, + WD_PARSE_STALL_IN_SC0 = 108, + WD_PARSE_STALL_IN_SC1 = 109, + WD_PARSE_STALL_IN_SC2 = 110, + WD_PARSE_STALL_IN_SC3 = 111, + WD_PARSE_STALL_OUT_SC0 = 112, + WD_PARSE_STALL_OUT_SC1 = 113, + WD_PARSE_STALL_OUT_SC2 = 114, + WD_PARSE_STALL_OUT_SC3 = 115, + WD_TRANS_WS_SC0 = 116, + WD_TRANS_WS_SC1 = 117, + WD_TRANS_WS_SC2 = 118, + WD_TRANS_WS_SC3 = 119, + WD_TRANS_WB0 = 120, + WD_TRANS_WB1 = 121, + WD_TRANS_WB2 = 122, + WD_TRANS_WB3 = 123, + SRAM_RD_TRANS_ACCEPTED = 128, + SRAM_RD_TRANS_COMPLETED = 129, + SRAM_RD_DATA_BEAT_RECEIVED = 130, + SRAM_RD_TRAN_REQ_STALLED = 131, + SRAM_WR_TRANS_ACCEPTED = 132, + SRAM_WR_TRANS_COMPLETED_M = 133, + SRAM_WR_TRANS_COMPLETED_S = 134, + SRAM_WR_DATA_BEAT_WRITTEN = 135, + SRAM_WR_TRAN_REQ_STALLED = 136, + SRAM_WR_DATA_BEAT_STALLED = 137, + SRAM_ENABLED_CYCLES = 140, + SRAM_RD_STALL_LIMIT = 142, + SRAM_WR_STALL_LIMIT = 143, + AXI_LATENCY_ANY = 160, + AXI_LATENCY_32 = 161, + AXI_LATENCY_64 = 162, + AXI_LATENCY_128 = 163, + AXI_LATENCY_256 = 164, + AXI_LATENCY_512 = 165, + AXI_LATENCY_1024 = 166, + ECC_DMA = 176, + ECC_MAC_IB = 177, + ECC_MAC_AB = 178, + ECC_AO_CB = 179, + ECC_AO_OB = 180, + ECC_AO_LUT = 181, + EXT_RD_TRANS_ACCEPTED = 384, + EXT_RD_TRANS_COMPLETED = 385, + EXT_RD_DATA_BEAT_RECEIVED = 386, + EXT_RD_TRAN_REQ_STALLED = 387, + 
EXT_WR_TRANS_ACCEPTED = 388, + EXT_WR_TRANS_COMPLETED_M = 389, + EXT_WR_TRANS_COMPLETED_S = 390, + EXT_WR_DATA_BEAT_WRITTEN = 391, + EXT_WR_TRAN_REQ_STALLED = 392, + EXT_WR_DATA_BEAT_STALLED = 393, + EXT_ENABLED_CYCLES = 396, + EXT_RD_STALL_LIMIT = 398, + EXT_WR_STALL_LIMIT = 399, +}; + +enum class pmu_port_disable : uint8_t +{ + ENABLE = 0, + DISABLE = 1, +}; + +enum class pooling_mode : uint8_t +{ + MAX = 0, + AVERAGE = 1, + REDUCE_SUM = 2, + SUM = 3, + NONE = 4, + MIN = 5, + ARGMAX_X = 6, + ARGMAX_Y = 7, +}; + +enum class privilege_level : uint8_t +{ + USER = 0, + PRIVILEGED = 1, +}; + +enum class ram_id : uint8_t +{ + LUT = 0, + IB = 1, + AB = 2, + CB = 3, + OB = 4, +}; + +enum class resize_mode : uint8_t +{ + BILINEAR = 0, + REPLICATE = 1, + NEAREST = 2, +}; + +enum class round_mode_ifm : uint8_t +{ + DOUBLE_SYMMETRIC = 0, + NATURAL = 1, +}; + +enum class round_mode_ofm : uint8_t +{ + DOUBLE_SYMMETRIC = 0, + NATURAL = 1, + DOUBLE_ASYMMETRIC = 2, + SYMMETRIC = 3, + TRUNCATE_TO_ZERO = 4, + TRUNCATE_TO_LOWER = 5, +}; + +enum class security_level : uint8_t +{ + SECURE = 0, + NON_SECURE = 1, +}; + +enum class state : uint8_t +{ + STOPPED = 0, + RUNNING = 1, +}; + +enum class wd_active_core : uint8_t +{ + NONE = 0, + STANDARD = 1, + FAST = 2, + TENSOR = 3, +}; + +enum class weight_format : uint8_t +{ + SWD = 0, + FWD = 1, +}; + +enum class weight_order : uint8_t +{ + DEPTH_FIRST = 0, + PART_KERNEL_FIRST = 1, +}; + +enum class weight_sparsity : uint8_t +{ + NONE = 0, + SPARSE_2_4 = 1, +}; + +#else + +enum acc_format +{ + ACC_FORMAT_I32 = 0, + ACC_FORMAT_I48 = 1, +}; + +enum acc_input +{ + ACC_INPUT_RESET = 0, + ACC_INPUT_KEEP = 1, + ACC_INPUT_IFM2 = 2, +}; + +enum acc_output +{ + ACC_OUTPUT_ENABLE = 0, + ACC_OUTPUT_DISABLE = 1, +}; + +enum activation_clip_range +{ + ACTIVATION_CLIP_RANGE_B16 = 0, + ACTIVATION_CLIP_RANGE_NONE = 1, +}; + +enum activation_format +{ + ACTIVATION_FORMAT_NHWC = 0, + ACTIVATION_FORMAT_NHCWB16 = 1, +}; + +enum activation_function +{ + 
ACTIVATION_FUNCTION_LUT_NONE = 0, + ACTIVATION_FUNCTION_LUT_U8_U8 = 1, + ACTIVATION_FUNCTION_LUT_S8_S8 = 4, + ACTIVATION_FUNCTION_LUT_S8_S16 = 5, + ACTIVATION_FUNCTION_LUT_S8_S32 = 7, + ACTIVATION_FUNCTION_LUT_S16_S16 = 8, + ACTIVATION_FUNCTION_LUT_S16_S32 = 9, + ACTIVATION_FUNCTION_LUT_TANH = 10, + ACTIVATION_FUNCTION_LUT_SIGMOID = 11, +}; + +enum activation_precision +{ + ACTIVATION_PRECISION_B8 = 0, + ACTIVATION_PRECISION_B16 = 1, + ACTIVATION_PRECISION_B32 = 2, + ACTIVATION_PRECISION_B64 = 3, +}; + +enum activation_reverse +{ + ACTIVATION_REVERSE_NONE = 0, + ACTIVATION_REVERSE_H = 1, + ACTIVATION_REVERSE_W = 2, + ACTIVATION_REVERSE_C = 3, +}; + +enum activation_storage +{ + ACTIVATION_STORAGE_TILE2X2 = 0, + ACTIVATION_STORAGE_TILE3X1 = 1, + ACTIVATION_STORAGE_CHAINED = 2, + ACTIVATION_STORAGE_NONE = 3, +}; + +enum activation_transpose +{ + ACTIVATION_TRANSPOSE_HWC = 0, + ACTIVATION_TRANSPOSE_WHC = 1, + ACTIVATION_TRANSPOSE_HCW = 2, + ACTIVATION_TRANSPOSE_WCH = 3, + ACTIVATION_TRANSPOSE_CHW = 6, + ACTIVATION_TRANSPOSE_CWH = 7, +}; + +enum activation_type +{ + ACTIVATION_TYPE_UNSIGNED = 0, + ACTIVATION_TYPE_SIGNED = 1, +}; + +enum axi_mem_domain +{ + AXI_MEM_DOMAIN_NON_SHARABLE = 0, + AXI_MEM_DOMAIN_INNER_SHARABLE = 1, + AXI_MEM_DOMAIN_OUTER_SHARABLE = 2, + AXI_MEM_DOMAIN_SYSTEM = 3, +}; + +enum axi_mem_encoding +{ + AXI_MEM_ENCODING_DEVICE_NON_BUFFERABLE = 0, + AXI_MEM_ENCODING_DEVICE_BUFFERABLE = 1, + AXI_MEM_ENCODING_NORMAL_NON_CACHEABLE_NON_BUFFERABLE = 2, + AXI_MEM_ENCODING_NORMAL_NON_CACHEABLE_BUFFERABLE = 3, + AXI_MEM_ENCODING_WRITE_THROUGH_NO_ALLOCATE = 4, + AXI_MEM_ENCODING_WRITE_THROUGH_READ_ALLOCATE = 5, + AXI_MEM_ENCODING_WRITE_THROUGH_WRITE_ALLOCATE = 6, + AXI_MEM_ENCODING_WRITE_THROUGH_READ_AND_WRITE_ALLOCATE = 7, + AXI_MEM_ENCODING_WRITE_BACK_NO_ALLOCATE = 8, + AXI_MEM_ENCODING_WRITE_BACK_READ_ALLOCATE = 9, + AXI_MEM_ENCODING_WRITE_BACK_WRITE_ALLOCATE = 10, + AXI_MEM_ENCODING_WRITE_BACK_READ_AND_WRITE_ALLOCATE = 11, +}; + +enum axi_port +{ + 
AXI_PORT_SRAM = 0, + AXI_PORT_EXT = 1, +}; + +enum branch_cond +{ + BRANCH_COND_ALWAYS = 0, + BRANCH_COND_RF_TRUE = 1, +}; + +enum broadcast_mode +{ + BROADCAST_MODE_NONE = 0, + BROADCAST_MODE_H = 1, + BROADCAST_MODE_W = 2, + BROADCAST_MODE_HW = 3, + BROADCAST_MODE_C = 4, + BROADCAST_MODE_CH = 5, + BROADCAST_MODE_CW = 6, + BROADCAST_MODE_CWH = 7, + BROADCAST_MODE_SCALAR = 8, +}; + +enum cmd0_opcode +{ + CMD0_OPCODE_NPU_OP_STOP = 0, + CMD0_OPCODE_NPU_OP_IRQ = 1, + CMD0_OPCODE_NPU_OP_CONV = 2, + CMD0_OPCODE_NPU_OP_DEPTHWISE = 3, + CMD0_OPCODE_NPU_OP_POOL = 5, + CMD0_OPCODE_NPU_OP_ELEMENTWISE = 6, + CMD0_OPCODE_NPU_OP_RESIZE = 7, + CMD0_OPCODE_NPU_OP_DMA_START = 16, + CMD0_OPCODE_NPU_OP_DMA_WAIT = 17, + CMD0_OPCODE_NPU_OP_KERNEL_WAIT = 18, + CMD0_OPCODE_NPU_OP_PMU_MASK = 19, + CMD0_OPCODE_NPU_SET_IFM_PAD_TOP = 256, + CMD0_OPCODE_NPU_SET_IFM_PAD_LEFT = 257, + CMD0_OPCODE_NPU_SET_IFM_PAD_RIGHT = 258, + CMD0_OPCODE_NPU_SET_IFM_PAD_BOTTOM = 259, + CMD0_OPCODE_NPU_SET_IFM_DEPTH_M1 = 260, + CMD0_OPCODE_NPU_SET_IFM_PRECISION = 261, + CMD0_OPCODE_NPU_SET_IFM_UPSCALE = 263, + CMD0_OPCODE_NPU_SET_IFM_BROADCAST = 264, + CMD0_OPCODE_NPU_SET_IFM_ZERO_POINT = 265, + CMD0_OPCODE_NPU_SET_IFM_WIDTH0_M1 = 266, + CMD0_OPCODE_NPU_SET_IFM_HEIGHT0_M1 = 267, + CMD0_OPCODE_NPU_SET_IFM_HEIGHT1_M1 = 268, + CMD0_OPCODE_NPU_SET_IFM_REGION = 271, + CMD0_OPCODE_NPU_SET_OFM_WIDTH_M1 = 273, + CMD0_OPCODE_NPU_SET_OFM_HEIGHT_M1 = 274, + CMD0_OPCODE_NPU_SET_OFM_DEPTH_M1 = 275, + CMD0_OPCODE_NPU_SET_OFM_PRECISION = 276, + CMD0_OPCODE_NPU_SET_OFM_BLK_WIDTH_M1 = 277, + CMD0_OPCODE_NPU_SET_OFM_BLK_HEIGHT_M1 = 278, + CMD0_OPCODE_NPU_SET_OFM_BLK_DEPTH_M1 = 279, + CMD0_OPCODE_NPU_SET_OFM_ZERO_POINT = 280, + CMD0_OPCODE_NPU_SET_OFM_WIDTH0_M1 = 282, + CMD0_OPCODE_NPU_SET_OFM_HEIGHT0_M1 = 283, + CMD0_OPCODE_NPU_SET_OFM_HEIGHT1_M1 = 284, + CMD0_OPCODE_NPU_SET_OFM_REGION = 287, + CMD0_OPCODE_NPU_SET_KERNEL_WIDTH_M1 = 288, + CMD0_OPCODE_NPU_SET_KERNEL_HEIGHT_M1 = 289, + CMD0_OPCODE_NPU_SET_KERNEL_STRIDE = 290, + 
CMD0_OPCODE_NPU_SET_ACC_FORMAT = 292, + CMD0_OPCODE_NPU_SET_ACTIVATION = 293, + CMD0_OPCODE_NPU_SET_ACTIVATION_MIN = 294, + CMD0_OPCODE_NPU_SET_ACTIVATION_MAX = 295, + CMD0_OPCODE_NPU_SET_WEIGHT_REGION = 296, + CMD0_OPCODE_NPU_SET_SCALE_REGION = 297, + CMD0_OPCODE_NPU_SET_RESIZE_X_SCALE_N_M1 = 298, + CMD0_OPCODE_NPU_SET_RESIZE_Y_SCALE_N_M1 = 299, + CMD0_OPCODE_NPU_SET_RESIZE_X_OFFSET = 300, + CMD0_OPCODE_NPU_SET_RESIZE_Y_OFFSET = 301, + CMD0_OPCODE_NPU_SET_WEIGHT_FORMAT = 302, + CMD0_OPCODE_NPU_SET_BLOCKDEP = 303, + CMD0_OPCODE_NPU_SET_DMA0_SRC_REGION = 304, + CMD0_OPCODE_NPU_SET_DMA0_DST_REGION = 305, + CMD0_OPCODE_NPU_SET_DMA0_SIZE0 = 306, + CMD0_OPCODE_NPU_SET_DMA0_SIZE1 = 307, + CMD0_OPCODE_NPU_SET_DMA0_IDX_REGION = 308, + CMD0_OPCODE_NPU_SET_IFM2_BROADCAST = 384, + CMD0_OPCODE_NPU_SET_IFM2_PRECISION = 389, + CMD0_OPCODE_NPU_SET_IFM2_ZERO_POINT = 393, + CMD0_OPCODE_NPU_SET_IFM2_WIDTH0_M1 = 394, + CMD0_OPCODE_NPU_SET_IFM2_HEIGHT0_M1 = 395, + CMD0_OPCODE_NPU_SET_IFM2_HEIGHT1_M1 = 396, + CMD0_OPCODE_NPU_SET_IFM2_REGION = 399, +}; + +enum cmd1_opcode +{ + CMD1_OPCODE_NPU_SET_IFM_BASE0 = 0, + CMD1_OPCODE_NPU_SET_IFM_BASE1 = 1, + CMD1_OPCODE_NPU_SET_IFM_BASE2 = 2, + CMD1_OPCODE_NPU_SET_IFM_BASE3 = 3, + CMD1_OPCODE_NPU_SET_IFM_STRIDE_X = 4, + CMD1_OPCODE_NPU_SET_IFM_STRIDE_Y = 5, + CMD1_OPCODE_NPU_SET_IFM_STRIDE_C = 6, + CMD1_OPCODE_NPU_SET_OFM_BASE0 = 16, + CMD1_OPCODE_NPU_SET_OFM_BASE1 = 17, + CMD1_OPCODE_NPU_SET_OFM_BASE2 = 18, + CMD1_OPCODE_NPU_SET_OFM_BASE3 = 19, + CMD1_OPCODE_NPU_SET_OFM_STRIDE_X = 20, + CMD1_OPCODE_NPU_SET_OFM_STRIDE_Y = 21, + CMD1_OPCODE_NPU_SET_OFM_STRIDE_C = 22, + CMD1_OPCODE_NPU_SET_WEIGHT_BASE = 32, + CMD1_OPCODE_NPU_SET_WEIGHT_LENGTH = 33, + CMD1_OPCODE_NPU_SET_SCALE_BASE = 34, + CMD1_OPCODE_NPU_SET_SCALE_LENGTH = 35, + CMD1_OPCODE_NPU_SET_OFM_SCALE = 36, + CMD1_OPCODE_NPU_SET_IFM_SCALE = 37, + CMD1_OPCODE_NPU_SET_IFM2_SCALE = 38, + CMD1_OPCODE_NPU_SET_OP_SCALAR = 39, + CMD1_OPCODE_NPU_SET_DMA0_SRC = 48, + CMD1_OPCODE_NPU_SET_DMA0_DST = 
49, + CMD1_OPCODE_NPU_SET_DMA0_LEN = 50, + CMD1_OPCODE_NPU_SET_DMA0_SRC_STRIDE0 = 51, + CMD1_OPCODE_NPU_SET_DMA0_SRC_STRIDE1 = 52, + CMD1_OPCODE_NPU_SET_DMA0_DST_STRIDE0 = 53, + CMD1_OPCODE_NPU_SET_DMA0_DST_STRIDE1 = 54, + CMD1_OPCODE_NPU_SET_DMA0_IDX = 55, + CMD1_OPCODE_NPU_SET_DMA0_IDX_MAX = 56, + CMD1_OPCODE_NPU_SET_DMA0_IDX_SKIP1 = 57, + CMD1_OPCODE_NPU_SET_IFM2_BASE0 = 128, + CMD1_OPCODE_NPU_SET_IFM2_BASE1 = 129, + CMD1_OPCODE_NPU_SET_IFM2_BASE2 = 130, + CMD1_OPCODE_NPU_SET_IFM2_BASE3 = 131, + CMD1_OPCODE_NPU_SET_IFM2_STRIDE_X = 132, + CMD1_OPCODE_NPU_SET_IFM2_STRIDE_Y = 133, + CMD1_OPCODE_NPU_SET_IFM2_STRIDE_C = 134, + CMD1_OPCODE_NPU_SET_WEIGHT1_BASE = 144, + CMD1_OPCODE_NPU_SET_WEIGHT1_LENGTH = 145, + CMD1_OPCODE_NPU_SET_WEIGHT2_BASE = 146, + CMD1_OPCODE_NPU_SET_WEIGHT2_LENGTH = 147, + CMD1_OPCODE_NPU_SET_WEIGHT3_BASE = 148, + CMD1_OPCODE_NPU_SET_WEIGHT3_LENGTH = 149, + CMD1_OPCODE_NPU_SET_RESIZE_X = 150, + CMD1_OPCODE_NPU_SET_RESIZE_Y = 151, + CMD1_OPCODE_NPU_OP_BRANCH = 256, +}; + +enum cmd_ctrl +{ + CMD_CTRL_CMD0_CTRL = 0, + CMD_CTRL_CMD1_CTRL = 1, +}; + +enum custom_dma +{ + CUSTOM_DMA_NOT_IMPLEMENTED = 0, + CUSTOM_DMA_IMPLEMENTED = 1, +}; + +enum dma_fault_channel +{ + DMA_FAULT_CHANNEL_CMD_READ = 0, + DMA_FAULT_CHANNEL_IFM_READ = 1, + DMA_FAULT_CHANNEL_WEIGHT_READ = 2, + DMA_FAULT_CHANNEL_SBS_READ = 3, + DMA_FAULT_CHANNEL_MEM2MEM_READ = 4, + DMA_FAULT_CHANNEL_OFM_WRITE = 8, + DMA_FAULT_CHANNEL_MEM2MEM_WRITE = 9, +}; + +enum dma_fault_src +{ + DMA_FAULT_SRC_SRAM = 0, + DMA_FAULT_SRC_EXT = 1, +}; + +enum dma_idx_mode +{ + DMA_IDX_MODE_DISABLED = 0, + DMA_IDX_MODE_ENABLED = 1, +}; + +enum dma_region_mode +{ + DMA_REGION_MODE_EXTERNAL = 0, + DMA_REGION_MODE_INTERNAL = 1, +}; + +enum dma_stride_mode +{ + DMA_STRIDE_MODE_D1 = 0, + DMA_STRIDE_MODE_D2 = 1, + DMA_STRIDE_MODE_D3 = 2, +}; + +enum elementwise_mode +{ + ELEMENTWISE_MODE_MUL = 0, + ELEMENTWISE_MODE_ADD = 1, + ELEMENTWISE_MODE_SUB = 2, + ELEMENTWISE_MODE_MIN = 3, + ELEMENTWISE_MODE_MAX = 4, + 
ELEMENTWISE_MODE_LRELU = 5, + ELEMENTWISE_MODE_ABS = 6, + ELEMENTWISE_MODE_CLZ = 7, + ELEMENTWISE_MODE_SHR = 8, + ELEMENTWISE_MODE_SHL = 9, + ELEMENTWISE_MODE_LSR = 10, + ELEMENTWISE_MODE_DIV = 11, + ELEMENTWISE_MODE_CMP_EQ = 16, + ELEMENTWISE_MODE_CMP_NE = 17, + ELEMENTWISE_MODE_CMP_GE = 18, + ELEMENTWISE_MODE_CMP_GT = 19, + ELEMENTWISE_MODE_AND = 33, + ELEMENTWISE_MODE_OR = 34, + ELEMENTWISE_MODE_XOR = 35, + ELEMENTWISE_MODE_NOT = 36, + ELEMENTWISE_MODE_AND_NOT = 42, +}; + +enum ifm_upscale_mode +{ + IFM_UPSCALE_MODE_NONE = 0, + IFM_UPSCALE_MODE_NEAREST = 1, + IFM_UPSCALE_MODE_ZEROS = 2, +}; + +enum kernel_decomposition +{ + KERNEL_DECOMPOSITION_D8X8 = 0, + KERNEL_DECOMPOSITION_D4X4 = 1, +}; + +enum kernel_dilation +{ + KERNEL_DILATION_NONE = 0, + KERNEL_DILATION_X2 = 1, +}; + +enum max_beats +{ + MAX_BEATS_B64 = 0, + MAX_BEATS_B128 = 1, + MAX_BEATS_B256 = 2, +}; + +enum microblock +{ + MICROBLOCK_U1X1 = 0, + MICROBLOCK_U1X2 = 1, + MICROBLOCK_U1X4 = 2, + MICROBLOCK_U2X2 = 3, + MICROBLOCK_U2X4 = 4, + MICROBLOCK_U4X4 = 5, +}; + +enum ofm_scale_mode +{ + OFM_SCALE_MODE_PER_CHANNEL = 0, + OFM_SCALE_MODE_GLOBAL = 1, +}; + +enum pmu_axi_channel +{ + PMU_AXI_CHANNEL_RD_CMD = 0, + PMU_AXI_CHANNEL_RD_IFM = 1, + PMU_AXI_CHANNEL_RD_WEIGHTS = 2, + PMU_AXI_CHANNEL_RD_SCALE_BIAS = 3, + PMU_AXI_CHANNEL_RD_MEM2MEM = 4, + PMU_AXI_CHANNEL_RD_IFM_STREAM = 5, + PMU_AXI_CHANNEL_RD_MEM2MEM_IDX = 6, + PMU_AXI_CHANNEL_WR_OFM = 8, + PMU_AXI_CHANNEL_WR_MEM2MEM = 9, +}; + +enum pmu_event +{ + PMU_EVENT_NO_EVENT = 0, + PMU_EVENT_CYCLE = 17, + PMU_EVENT_NPU_IDLE = 32, + PMU_EVENT_CC_STALLED_ON_BLOCKDEP = 33, + PMU_EVENT_CC_STALLED_ON_SHRAM_RECONFIG = 34, + PMU_EVENT_NPU_ACTIVE = 35, + PMU_EVENT_MAC_ACTIVE = 48, + PMU_EVENT_MAC_DPU_ACTIVE = 51, + PMU_EVENT_MAC_STALLED_BY_W_OR_ACC = 52, + PMU_EVENT_MAC_STALLED_BY_W = 53, + PMU_EVENT_MAC_STALLED_BY_ACC = 54, + PMU_EVENT_MAC_STALLED_BY_IB = 55, + PMU_EVENT_MAC_STALLED_BY_INT_W = 57, + PMU_EVENT_MAC_STALLED_BY_INT_ACC = 58, + PMU_EVENT_AO_ACTIVE 
= 64, + PMU_EVENT_AO_STALLED_BY_BS_OR_OB = 67, + PMU_EVENT_AO_STALLED_BY_BS = 68, + PMU_EVENT_AO_STALLED_BY_OB = 69, + PMU_EVENT_AO_STALLED_BY_AB_OR_CB = 70, + PMU_EVENT_AO_STALLED_BY_AB = 71, + PMU_EVENT_AO_STALLED_BY_CB = 72, + PMU_EVENT_WD_ACTIVE = 80, + PMU_EVENT_WD_STALLED = 81, + PMU_EVENT_WD_STALLED_BY_WD_BUF = 83, + PMU_EVENT_WD_STALLED_BY_WS_FC = 84, + PMU_EVENT_WD_STALLED_BY_WS_TC = 85, + PMU_EVENT_WD_TRANS_WBLK = 89, + PMU_EVENT_WD_TRANS_WS_FC = 90, + PMU_EVENT_WD_TRANS_WS_TC = 91, + PMU_EVENT_WD_STALLED_BY_WS_SC0 = 96, + PMU_EVENT_WD_STALLED_BY_WS_SC1 = 97, + PMU_EVENT_WD_STALLED_BY_WS_SC2 = 98, + PMU_EVENT_WD_STALLED_BY_WS_SC3 = 99, + PMU_EVENT_WD_PARSE_ACTIVE_SC0 = 100, + PMU_EVENT_WD_PARSE_ACTIVE_SC1 = 101, + PMU_EVENT_WD_PARSE_ACTIVE_SC2 = 102, + PMU_EVENT_WD_PARSE_ACTIVE_SC3 = 103, + PMU_EVENT_WD_PARSE_STALL_SC0 = 104, + PMU_EVENT_WD_PARSE_STALL_SC1 = 105, + PMU_EVENT_WD_PARSE_STALL_SC2 = 106, + PMU_EVENT_WD_PARSE_STALL_SC3 = 107, + PMU_EVENT_WD_PARSE_STALL_IN_SC0 = 108, + PMU_EVENT_WD_PARSE_STALL_IN_SC1 = 109, + PMU_EVENT_WD_PARSE_STALL_IN_SC2 = 110, + PMU_EVENT_WD_PARSE_STALL_IN_SC3 = 111, + PMU_EVENT_WD_PARSE_STALL_OUT_SC0 = 112, + PMU_EVENT_WD_PARSE_STALL_OUT_SC1 = 113, + PMU_EVENT_WD_PARSE_STALL_OUT_SC2 = 114, + PMU_EVENT_WD_PARSE_STALL_OUT_SC3 = 115, + PMU_EVENT_WD_TRANS_WS_SC0 = 116, + PMU_EVENT_WD_TRANS_WS_SC1 = 117, + PMU_EVENT_WD_TRANS_WS_SC2 = 118, + PMU_EVENT_WD_TRANS_WS_SC3 = 119, + PMU_EVENT_WD_TRANS_WB0 = 120, + PMU_EVENT_WD_TRANS_WB1 = 121, + PMU_EVENT_WD_TRANS_WB2 = 122, + PMU_EVENT_WD_TRANS_WB3 = 123, + PMU_EVENT_SRAM_RD_TRANS_ACCEPTED = 128, + PMU_EVENT_SRAM_RD_TRANS_COMPLETED = 129, + PMU_EVENT_SRAM_RD_DATA_BEAT_RECEIVED = 130, + PMU_EVENT_SRAM_RD_TRAN_REQ_STALLED = 131, + PMU_EVENT_SRAM_WR_TRANS_ACCEPTED = 132, + PMU_EVENT_SRAM_WR_TRANS_COMPLETED_M = 133, + PMU_EVENT_SRAM_WR_TRANS_COMPLETED_S = 134, + PMU_EVENT_SRAM_WR_DATA_BEAT_WRITTEN = 135, + PMU_EVENT_SRAM_WR_TRAN_REQ_STALLED = 136, + PMU_EVENT_SRAM_WR_DATA_BEAT_STALLED = 
137, + PMU_EVENT_SRAM_ENABLED_CYCLES = 140, + PMU_EVENT_SRAM_RD_STALL_LIMIT = 142, + PMU_EVENT_SRAM_WR_STALL_LIMIT = 143, + PMU_EVENT_AXI_LATENCY_ANY = 160, + PMU_EVENT_AXI_LATENCY_32 = 161, + PMU_EVENT_AXI_LATENCY_64 = 162, + PMU_EVENT_AXI_LATENCY_128 = 163, + PMU_EVENT_AXI_LATENCY_256 = 164, + PMU_EVENT_AXI_LATENCY_512 = 165, + PMU_EVENT_AXI_LATENCY_1024 = 166, + PMU_EVENT_ECC_DMA = 176, + PMU_EVENT_ECC_MAC_IB = 177, + PMU_EVENT_ECC_MAC_AB = 178, + PMU_EVENT_ECC_AO_CB = 179, + PMU_EVENT_ECC_AO_OB = 180, + PMU_EVENT_ECC_AO_LUT = 181, + PMU_EVENT_EXT_RD_TRANS_ACCEPTED = 384, + PMU_EVENT_EXT_RD_TRANS_COMPLETED = 385, + PMU_EVENT_EXT_RD_DATA_BEAT_RECEIVED = 386, + PMU_EVENT_EXT_RD_TRAN_REQ_STALLED = 387, + PMU_EVENT_EXT_WR_TRANS_ACCEPTED = 388, + PMU_EVENT_EXT_WR_TRANS_COMPLETED_M = 389, + PMU_EVENT_EXT_WR_TRANS_COMPLETED_S = 390, + PMU_EVENT_EXT_WR_DATA_BEAT_WRITTEN = 391, + PMU_EVENT_EXT_WR_TRAN_REQ_STALLED = 392, + PMU_EVENT_EXT_WR_DATA_BEAT_STALLED = 393, + PMU_EVENT_EXT_ENABLED_CYCLES = 396, + PMU_EVENT_EXT_RD_STALL_LIMIT = 398, + PMU_EVENT_EXT_WR_STALL_LIMIT = 399, +}; + +enum pmu_port_disable +{ + PMU_PORT_DISABLE_ENABLE = 0, + PMU_PORT_DISABLE_DISABLE = 1, +}; + +enum pooling_mode +{ + POOLING_MODE_MAX = 0, + POOLING_MODE_AVERAGE = 1, + POOLING_MODE_REDUCE_SUM = 2, + POOLING_MODE_SUM = 3, + POOLING_MODE_NONE = 4, + POOLING_MODE_MIN = 5, + POOLING_MODE_ARGMAX_X = 6, + POOLING_MODE_ARGMAX_Y = 7, +}; + +enum privilege_level +{ + PRIVILEGE_LEVEL_USER = 0, + PRIVILEGE_LEVEL_PRIVILEGED = 1, +}; + +enum ram_id +{ + RAM_ID_LUT = 0, + RAM_ID_IB = 1, + RAM_ID_AB = 2, + RAM_ID_CB = 3, + RAM_ID_OB = 4, +}; + +enum resize_mode +{ + RESIZE_MODE_BILINEAR = 0, + RESIZE_MODE_REPLICATE = 1, + RESIZE_MODE_NEAREST = 2, +}; + +enum round_mode_ifm +{ + ROUND_MODE_IFM_DOUBLE_SYMMETRIC = 0, + ROUND_MODE_IFM_NATURAL = 1, +}; + +enum round_mode_ofm +{ + ROUND_MODE_OFM_DOUBLE_SYMMETRIC = 0, + ROUND_MODE_OFM_NATURAL = 1, + ROUND_MODE_OFM_DOUBLE_ASYMMETRIC = 2, + ROUND_MODE_OFM_SYMMETRIC 
= 3, + ROUND_MODE_OFM_TRUNCATE_TO_ZERO = 4, + ROUND_MODE_OFM_TRUNCATE_TO_LOWER = 5, +}; + +enum security_level +{ + SECURITY_LEVEL_SECURE = 0, + SECURITY_LEVEL_NON_SECURE = 1, +}; + +enum state +{ + STATE_STOPPED = 0, + STATE_RUNNING = 1, +}; + +enum wd_active_core +{ + WD_ACTIVE_CORE_NONE = 0, + WD_ACTIVE_CORE_STANDARD = 1, + WD_ACTIVE_CORE_FAST = 2, + WD_ACTIVE_CORE_TENSOR = 3, +}; + +enum weight_format +{ + WEIGHT_FORMAT_SWD = 0, + WEIGHT_FORMAT_FWD = 1, +}; + +enum weight_order +{ + WEIGHT_ORDER_DEPTH_FIRST = 0, + WEIGHT_ORDER_PART_KERNEL_FIRST = 1, +}; + +enum weight_sparsity +{ + WEIGHT_SPARSITY_NONE = 0, + WEIGHT_SPARSITY_SPARSE_2_4 = 1, +}; + +#endif + +#ifdef NPU_DISASSEMBLE + +static const char* acc_format_str[] = +{ + "ACC_FORMAT_I32", + "ACC_FORMAT_I48", +}; + +static const char* acc_input_str[] = +{ + "ACC_INPUT_RESET", + "ACC_INPUT_KEEP", + "ACC_INPUT_IFM2", +}; + +static const char* acc_output_str[] = +{ + "ACC_OUTPUT_ENABLE", + "ACC_OUTPUT_DISABLE", +}; + +static const char* activation_clip_range_str[] = +{ + "ACTIVATION_CLIP_RANGE_B16", + "ACTIVATION_CLIP_RANGE_NONE", +}; + +static const char* activation_format_str[] = +{ + "ACTIVATION_FORMAT_NHWC", + "ACTIVATION_FORMAT_NHCWB16", +}; + +static const char* activation_function_str[] = +{ + "ACTIVATION_FUNCTION_LUT_NONE", + "ACTIVATION_FUNCTION_LUT_U8_U8", + "****", + "****", + "ACTIVATION_FUNCTION_LUT_S8_S8", + "ACTIVATION_FUNCTION_LUT_S8_S16", + "****", + "ACTIVATION_FUNCTION_LUT_S8_S32", + "ACTIVATION_FUNCTION_LUT_S16_S16", + "ACTIVATION_FUNCTION_LUT_S16_S32", + "ACTIVATION_FUNCTION_LUT_TANH", + "ACTIVATION_FUNCTION_LUT_SIGMOID", +}; + +static const char* activation_precision_str[] = +{ + "ACTIVATION_PRECISION_B8", + "ACTIVATION_PRECISION_B16", + "ACTIVATION_PRECISION_B32", + "ACTIVATION_PRECISION_B64", +}; + +static const char* activation_reverse_str[] = +{ + "ACTIVATION_REVERSE_NONE", + "ACTIVATION_REVERSE_H", + "ACTIVATION_REVERSE_W", + "ACTIVATION_REVERSE_C", +}; + +static const char* 
activation_storage_str[] = +{ + "ACTIVATION_STORAGE_TILE2X2", + "ACTIVATION_STORAGE_TILE3X1", + "ACTIVATION_STORAGE_CHAINED", + "ACTIVATION_STORAGE_NONE", +}; + +static const char* activation_transpose_str[] = +{ + "ACTIVATION_TRANSPOSE_HWC", + "ACTIVATION_TRANSPOSE_WHC", + "ACTIVATION_TRANSPOSE_HCW", + "ACTIVATION_TRANSPOSE_WCH", + "****", + "****", + "ACTIVATION_TRANSPOSE_CHW", + "ACTIVATION_TRANSPOSE_CWH", +}; + +static const char* activation_type_str[] = +{ + "ACTIVATION_TYPE_UNSIGNED", + "ACTIVATION_TYPE_SIGNED", +}; + +static const char* axi_mem_domain_str[] = +{ + "AXI_MEM_DOMAIN_NON_SHARABLE", + "AXI_MEM_DOMAIN_INNER_SHARABLE", + "AXI_MEM_DOMAIN_OUTER_SHARABLE", + "AXI_MEM_DOMAIN_SYSTEM", +}; + +static const char* axi_mem_encoding_str[] = +{ + "AXI_MEM_ENCODING_DEVICE_NON_BUFFERABLE", + "AXI_MEM_ENCODING_DEVICE_BUFFERABLE", + "AXI_MEM_ENCODING_NORMAL_NON_CACHEABLE_NON_BUFFERABLE", + "AXI_MEM_ENCODING_NORMAL_NON_CACHEABLE_BUFFERABLE", + "AXI_MEM_ENCODING_WRITE_THROUGH_NO_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_THROUGH_READ_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_THROUGH_WRITE_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_THROUGH_READ_AND_WRITE_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_BACK_NO_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_BACK_READ_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_BACK_WRITE_ALLOCATE", + "AXI_MEM_ENCODING_WRITE_BACK_READ_AND_WRITE_ALLOCATE", +}; + +static const char* axi_port_str[] = +{ + "AXI_PORT_SRAM", + "AXI_PORT_EXT", +}; + +static const char* branch_cond_str[] = +{ + "BRANCH_COND_ALWAYS", + "BRANCH_COND_RF_TRUE", +}; + +static const char* broadcast_mode_str[] = +{ + "BROADCAST_MODE_NONE", + "BROADCAST_MODE_H", + "BROADCAST_MODE_W", + "BROADCAST_MODE_HW", + "BROADCAST_MODE_C", + "BROADCAST_MODE_CH", + "BROADCAST_MODE_CW", + "BROADCAST_MODE_CWH", + "BROADCAST_MODE_SCALAR", +}; + +static const char* cmd0_opcode_str[] = +{ + "CMD0_OPCODE_NPU_OP_STOP", + "CMD0_OPCODE_NPU_OP_IRQ", + "CMD0_OPCODE_NPU_OP_CONV", + "CMD0_OPCODE_NPU_OP_DEPTHWISE", + "****", + 
"CMD0_OPCODE_NPU_OP_POOL", + "CMD0_OPCODE_NPU_OP_ELEMENTWISE", + "CMD0_OPCODE_NPU_OP_RESIZE", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_OP_DMA_START", + "CMD0_OPCODE_NPU_OP_DMA_WAIT", + "CMD0_OPCODE_NPU_OP_KERNEL_WAIT", + "CMD0_OPCODE_NPU_OP_PMU_MASK", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", 
+ "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_SET_IFM_PAD_TOP", + "CMD0_OPCODE_NPU_SET_IFM_PAD_LEFT", + "CMD0_OPCODE_NPU_SET_IFM_PAD_RIGHT", + "CMD0_OPCODE_NPU_SET_IFM_PAD_BOTTOM", + "CMD0_OPCODE_NPU_SET_IFM_DEPTH_M1", + "CMD0_OPCODE_NPU_SET_IFM_PRECISION", + "****", + "CMD0_OPCODE_NPU_SET_IFM_UPSCALE", + "CMD0_OPCODE_NPU_SET_IFM_BROADCAST", + "CMD0_OPCODE_NPU_SET_IFM_ZERO_POINT", + "CMD0_OPCODE_NPU_SET_IFM_WIDTH0_M1", + "CMD0_OPCODE_NPU_SET_IFM_HEIGHT0_M1", + "CMD0_OPCODE_NPU_SET_IFM_HEIGHT1_M1", + "****", + "****", + "CMD0_OPCODE_NPU_SET_IFM_REGION", + "****", + "CMD0_OPCODE_NPU_SET_OFM_WIDTH_M1", + "CMD0_OPCODE_NPU_SET_OFM_HEIGHT_M1", + "CMD0_OPCODE_NPU_SET_OFM_DEPTH_M1", + "CMD0_OPCODE_NPU_SET_OFM_PRECISION", + "CMD0_OPCODE_NPU_SET_OFM_BLK_WIDTH_M1", + "CMD0_OPCODE_NPU_SET_OFM_BLK_HEIGHT_M1", + "CMD0_OPCODE_NPU_SET_OFM_BLK_DEPTH_M1", + "CMD0_OPCODE_NPU_SET_OFM_ZERO_POINT", + "****", + "CMD0_OPCODE_NPU_SET_OFM_WIDTH0_M1", + "CMD0_OPCODE_NPU_SET_OFM_HEIGHT0_M1", + "CMD0_OPCODE_NPU_SET_OFM_HEIGHT1_M1", + "****", + "****", + "CMD0_OPCODE_NPU_SET_OFM_REGION", + "CMD0_OPCODE_NPU_SET_KERNEL_WIDTH_M1", + "CMD0_OPCODE_NPU_SET_KERNEL_HEIGHT_M1", + "CMD0_OPCODE_NPU_SET_KERNEL_STRIDE", + "****", + "CMD0_OPCODE_NPU_SET_ACC_FORMAT", + "CMD0_OPCODE_NPU_SET_ACTIVATION", + "CMD0_OPCODE_NPU_SET_ACTIVATION_MIN", + 
"CMD0_OPCODE_NPU_SET_ACTIVATION_MAX", + "CMD0_OPCODE_NPU_SET_WEIGHT_REGION", + "CMD0_OPCODE_NPU_SET_SCALE_REGION", + "CMD0_OPCODE_NPU_SET_RESIZE_X_SCALE_N_M1", + "CMD0_OPCODE_NPU_SET_RESIZE_Y_SCALE_N_M1", + "CMD0_OPCODE_NPU_SET_RESIZE_X_OFFSET", + "CMD0_OPCODE_NPU_SET_RESIZE_Y_OFFSET", + "CMD0_OPCODE_NPU_SET_WEIGHT_FORMAT", + "CMD0_OPCODE_NPU_SET_BLOCKDEP", + "CMD0_OPCODE_NPU_SET_DMA0_SRC_REGION", + "CMD0_OPCODE_NPU_SET_DMA0_DST_REGION", + "CMD0_OPCODE_NPU_SET_DMA0_SIZE0", + "CMD0_OPCODE_NPU_SET_DMA0_SIZE1", + "CMD0_OPCODE_NPU_SET_DMA0_IDX_REGION", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_SET_IFM2_BROADCAST", + "****", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_SET_IFM2_PRECISION", + "****", + "****", + "****", + "CMD0_OPCODE_NPU_SET_IFM2_ZERO_POINT", + "CMD0_OPCODE_NPU_SET_IFM2_WIDTH0_M1", + "CMD0_OPCODE_NPU_SET_IFM2_HEIGHT0_M1", + "CMD0_OPCODE_NPU_SET_IFM2_HEIGHT1_M1", + "****", + "****", + "CMD0_OPCODE_NPU_SET_IFM2_REGION", +}; + +static const char* cmd1_opcode_str[] = +{ + "CMD1_OPCODE_NPU_SET_IFM_BASE0", + "CMD1_OPCODE_NPU_SET_IFM_BASE1", + "CMD1_OPCODE_NPU_SET_IFM_BASE2", + "CMD1_OPCODE_NPU_SET_IFM_BASE3", + "CMD1_OPCODE_NPU_SET_IFM_STRIDE_X", + "CMD1_OPCODE_NPU_SET_IFM_STRIDE_Y", + "CMD1_OPCODE_NPU_SET_IFM_STRIDE_C", + "****", + "****", + 
"****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_OFM_BASE0", + "CMD1_OPCODE_NPU_SET_OFM_BASE1", + "CMD1_OPCODE_NPU_SET_OFM_BASE2", + "CMD1_OPCODE_NPU_SET_OFM_BASE3", + "CMD1_OPCODE_NPU_SET_OFM_STRIDE_X", + "CMD1_OPCODE_NPU_SET_OFM_STRIDE_Y", + "CMD1_OPCODE_NPU_SET_OFM_STRIDE_C", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_WEIGHT_BASE", + "CMD1_OPCODE_NPU_SET_WEIGHT_LENGTH", + "CMD1_OPCODE_NPU_SET_SCALE_BASE", + "CMD1_OPCODE_NPU_SET_SCALE_LENGTH", + "CMD1_OPCODE_NPU_SET_OFM_SCALE", + "CMD1_OPCODE_NPU_SET_IFM_SCALE", + "CMD1_OPCODE_NPU_SET_IFM2_SCALE", + "CMD1_OPCODE_NPU_SET_OP_SCALAR", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_DMA0_SRC", + "CMD1_OPCODE_NPU_SET_DMA0_DST", + "CMD1_OPCODE_NPU_SET_DMA0_LEN", + "CMD1_OPCODE_NPU_SET_DMA0_SRC_STRIDE0", + "CMD1_OPCODE_NPU_SET_DMA0_SRC_STRIDE1", + "CMD1_OPCODE_NPU_SET_DMA0_DST_STRIDE0", + "CMD1_OPCODE_NPU_SET_DMA0_DST_STRIDE1", + "CMD1_OPCODE_NPU_SET_DMA0_IDX", + "CMD1_OPCODE_NPU_SET_DMA0_IDX_MAX", + "CMD1_OPCODE_NPU_SET_DMA0_IDX_SKIP1", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_IFM2_BASE0", + "CMD1_OPCODE_NPU_SET_IFM2_BASE1", + "CMD1_OPCODE_NPU_SET_IFM2_BASE2", + 
"CMD1_OPCODE_NPU_SET_IFM2_BASE3", + "CMD1_OPCODE_NPU_SET_IFM2_STRIDE_X", + "CMD1_OPCODE_NPU_SET_IFM2_STRIDE_Y", + "CMD1_OPCODE_NPU_SET_IFM2_STRIDE_C", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_SET_WEIGHT1_BASE", + "CMD1_OPCODE_NPU_SET_WEIGHT1_LENGTH", + "CMD1_OPCODE_NPU_SET_WEIGHT2_BASE", + "CMD1_OPCODE_NPU_SET_WEIGHT2_LENGTH", + "CMD1_OPCODE_NPU_SET_WEIGHT3_BASE", + "CMD1_OPCODE_NPU_SET_WEIGHT3_LENGTH", + "CMD1_OPCODE_NPU_SET_RESIZE_X", + "CMD1_OPCODE_NPU_SET_RESIZE_Y", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "CMD1_OPCODE_NPU_OP_BRANCH", +}; + +static const char* cmd_ctrl_str[] = +{ + "CMD_CTRL_CMD0_CTRL", + "CMD_CTRL_CMD1_CTRL", +}; + +static const char* custom_dma_str[] = +{ + "CUSTOM_DMA_NOT_IMPLEMENTED", + "CUSTOM_DMA_IMPLEMENTED", +}; + +static const char* dma_fault_channel_str[] = +{ + "DMA_FAULT_CHANNEL_CMD_READ", + "DMA_FAULT_CHANNEL_IFM_READ", + "DMA_FAULT_CHANNEL_WEIGHT_READ", + 
"DMA_FAULT_CHANNEL_SBS_READ", + "DMA_FAULT_CHANNEL_MEM2MEM_READ", + "****", + "****", + "****", + "DMA_FAULT_CHANNEL_OFM_WRITE", + "DMA_FAULT_CHANNEL_MEM2MEM_WRITE", +}; + +static const char* dma_fault_src_str[] = +{ + "DMA_FAULT_SRC_SRAM", + "DMA_FAULT_SRC_EXT", +}; + +static const char* dma_idx_mode_str[] = +{ + "DMA_IDX_MODE_DISABLED", + "DMA_IDX_MODE_ENABLED", +}; + +static const char* dma_region_mode_str[] = +{ + "DMA_REGION_MODE_EXTERNAL", + "DMA_REGION_MODE_INTERNAL", +}; + +static const char* dma_stride_mode_str[] = +{ + "DMA_STRIDE_MODE_D1", + "DMA_STRIDE_MODE_D2", + "DMA_STRIDE_MODE_D3", +}; + +static const char* elementwise_mode_str[] = +{ + "ELEMENTWISE_MODE_MUL", + "ELEMENTWISE_MODE_ADD", + "ELEMENTWISE_MODE_SUB", + "ELEMENTWISE_MODE_MIN", + "ELEMENTWISE_MODE_MAX", + "ELEMENTWISE_MODE_LRELU", + "ELEMENTWISE_MODE_ABS", + "ELEMENTWISE_MODE_CLZ", + "ELEMENTWISE_MODE_SHR", + "ELEMENTWISE_MODE_SHL", + "ELEMENTWISE_MODE_LSR", + "ELEMENTWISE_MODE_DIV", + "****", + "****", + "****", + "****", + "ELEMENTWISE_MODE_CMP_EQ", + "ELEMENTWISE_MODE_CMP_NE", + "ELEMENTWISE_MODE_CMP_GE", + "ELEMENTWISE_MODE_CMP_GT", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "ELEMENTWISE_MODE_AND", + "ELEMENTWISE_MODE_OR", + "ELEMENTWISE_MODE_XOR", + "ELEMENTWISE_MODE_NOT", + "****", + "****", + "****", + "****", + "****", + "ELEMENTWISE_MODE_AND_NOT", +}; + +static const char* ifm_upscale_mode_str[] = +{ + "IFM_UPSCALE_MODE_NONE", + "IFM_UPSCALE_MODE_NEAREST", + "IFM_UPSCALE_MODE_ZEROS", +}; + +static const char* kernel_decomposition_str[] = +{ + "KERNEL_DECOMPOSITION_D8X8", + "KERNEL_DECOMPOSITION_D4X4", +}; + +static const char* kernel_dilation_str[] = +{ + "KERNEL_DILATION_NONE", + "KERNEL_DILATION_X2", +}; + +static const char* max_beats_str[] = +{ + "MAX_BEATS_B64", + "MAX_BEATS_B128", + "MAX_BEATS_B256", +}; + +static const char* microblock_str[] = +{ + "MICROBLOCK_U1X1", + "MICROBLOCK_U1X2", 
+ "MICROBLOCK_U1X4", + "MICROBLOCK_U2X2", + "MICROBLOCK_U2X4", + "MICROBLOCK_U4X4", +}; + +static const char* ofm_scale_mode_str[] = +{ + "OFM_SCALE_MODE_PER_CHANNEL", + "OFM_SCALE_MODE_GLOBAL", +}; + +static const char* pmu_axi_channel_str[] = +{ + "PMU_AXI_CHANNEL_RD_CMD", + "PMU_AXI_CHANNEL_RD_IFM", + "PMU_AXI_CHANNEL_RD_WEIGHTS", + "PMU_AXI_CHANNEL_RD_SCALE_BIAS", + "PMU_AXI_CHANNEL_RD_MEM2MEM", + "PMU_AXI_CHANNEL_RD_IFM_STREAM", + "PMU_AXI_CHANNEL_RD_MEM2MEM_IDX", + "****", + "PMU_AXI_CHANNEL_WR_OFM", + "PMU_AXI_CHANNEL_WR_MEM2MEM", +}; + +static const char* pmu_event_str[] = +{ + "PMU_EVENT_NO_EVENT", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_CYCLE", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_NPU_IDLE", + "PMU_EVENT_CC_STALLED_ON_BLOCKDEP", + "PMU_EVENT_CC_STALLED_ON_SHRAM_RECONFIG", + "PMU_EVENT_NPU_ACTIVE", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_MAC_ACTIVE", + "****", + "****", + "PMU_EVENT_MAC_DPU_ACTIVE", + "PMU_EVENT_MAC_STALLED_BY_W_OR_ACC", + "PMU_EVENT_MAC_STALLED_BY_W", + "PMU_EVENT_MAC_STALLED_BY_ACC", + "PMU_EVENT_MAC_STALLED_BY_IB", + "****", + "PMU_EVENT_MAC_STALLED_BY_INT_W", + "PMU_EVENT_MAC_STALLED_BY_INT_ACC", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_AO_ACTIVE", + "****", + "****", + "PMU_EVENT_AO_STALLED_BY_BS_OR_OB", + "PMU_EVENT_AO_STALLED_BY_BS", + "PMU_EVENT_AO_STALLED_BY_OB", + "PMU_EVENT_AO_STALLED_BY_AB_OR_CB", + "PMU_EVENT_AO_STALLED_BY_AB", + "PMU_EVENT_AO_STALLED_BY_CB", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_WD_ACTIVE", + "PMU_EVENT_WD_STALLED", + "****", + "PMU_EVENT_WD_STALLED_BY_WD_BUF", + "PMU_EVENT_WD_STALLED_BY_WS_FC", + 
"PMU_EVENT_WD_STALLED_BY_WS_TC", + "****", + "****", + "****", + "PMU_EVENT_WD_TRANS_WBLK", + "PMU_EVENT_WD_TRANS_WS_FC", + "PMU_EVENT_WD_TRANS_WS_TC", + "****", + "****", + "****", + "****", + "PMU_EVENT_WD_STALLED_BY_WS_SC0", + "PMU_EVENT_WD_STALLED_BY_WS_SC1", + "PMU_EVENT_WD_STALLED_BY_WS_SC2", + "PMU_EVENT_WD_STALLED_BY_WS_SC3", + "PMU_EVENT_WD_PARSE_ACTIVE_SC0", + "PMU_EVENT_WD_PARSE_ACTIVE_SC1", + "PMU_EVENT_WD_PARSE_ACTIVE_SC2", + "PMU_EVENT_WD_PARSE_ACTIVE_SC3", + "PMU_EVENT_WD_PARSE_STALL_SC0", + "PMU_EVENT_WD_PARSE_STALL_SC1", + "PMU_EVENT_WD_PARSE_STALL_SC2", + "PMU_EVENT_WD_PARSE_STALL_SC3", + "PMU_EVENT_WD_PARSE_STALL_IN_SC0", + "PMU_EVENT_WD_PARSE_STALL_IN_SC1", + "PMU_EVENT_WD_PARSE_STALL_IN_SC2", + "PMU_EVENT_WD_PARSE_STALL_IN_SC3", + "PMU_EVENT_WD_PARSE_STALL_OUT_SC0", + "PMU_EVENT_WD_PARSE_STALL_OUT_SC1", + "PMU_EVENT_WD_PARSE_STALL_OUT_SC2", + "PMU_EVENT_WD_PARSE_STALL_OUT_SC3", + "PMU_EVENT_WD_TRANS_WS_SC0", + "PMU_EVENT_WD_TRANS_WS_SC1", + "PMU_EVENT_WD_TRANS_WS_SC2", + "PMU_EVENT_WD_TRANS_WS_SC3", + "PMU_EVENT_WD_TRANS_WB0", + "PMU_EVENT_WD_TRANS_WB1", + "PMU_EVENT_WD_TRANS_WB2", + "PMU_EVENT_WD_TRANS_WB3", + "****", + "****", + "****", + "****", + "PMU_EVENT_SRAM_RD_TRANS_ACCEPTED", + "PMU_EVENT_SRAM_RD_TRANS_COMPLETED", + "PMU_EVENT_SRAM_RD_DATA_BEAT_RECEIVED", + "PMU_EVENT_SRAM_RD_TRAN_REQ_STALLED", + "PMU_EVENT_SRAM_WR_TRANS_ACCEPTED", + "PMU_EVENT_SRAM_WR_TRANS_COMPLETED_M", + "PMU_EVENT_SRAM_WR_TRANS_COMPLETED_S", + "PMU_EVENT_SRAM_WR_DATA_BEAT_WRITTEN", + "PMU_EVENT_SRAM_WR_TRAN_REQ_STALLED", + "PMU_EVENT_SRAM_WR_DATA_BEAT_STALLED", + "****", + "****", + "PMU_EVENT_SRAM_ENABLED_CYCLES", + "****", + "PMU_EVENT_SRAM_RD_STALL_LIMIT", + "PMU_EVENT_SRAM_WR_STALL_LIMIT", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_AXI_LATENCY_ANY", + "PMU_EVENT_AXI_LATENCY_32", + "PMU_EVENT_AXI_LATENCY_64", + 
"PMU_EVENT_AXI_LATENCY_128", + "PMU_EVENT_AXI_LATENCY_256", + "PMU_EVENT_AXI_LATENCY_512", + "PMU_EVENT_AXI_LATENCY_1024", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_ECC_DMA", + "PMU_EVENT_ECC_MAC_IB", + "PMU_EVENT_ECC_MAC_AB", + "PMU_EVENT_ECC_AO_CB", + "PMU_EVENT_ECC_AO_OB", + "PMU_EVENT_ECC_AO_LUT", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + 
"****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "****", + "PMU_EVENT_EXT_RD_TRANS_ACCEPTED", + "PMU_EVENT_EXT_RD_TRANS_COMPLETED", + "PMU_EVENT_EXT_RD_DATA_BEAT_RECEIVED", + "PMU_EVENT_EXT_RD_TRAN_REQ_STALLED", + "PMU_EVENT_EXT_WR_TRANS_ACCEPTED", + "PMU_EVENT_EXT_WR_TRANS_COMPLETED_M", + "PMU_EVENT_EXT_WR_TRANS_COMPLETED_S", + "PMU_EVENT_EXT_WR_DATA_BEAT_WRITTEN", + "PMU_EVENT_EXT_WR_TRAN_REQ_STALLED", + "PMU_EVENT_EXT_WR_DATA_BEAT_STALLED", + "****", + "****", + "PMU_EVENT_EXT_ENABLED_CYCLES", + "****", + "PMU_EVENT_EXT_RD_STALL_LIMIT", + "PMU_EVENT_EXT_WR_STALL_LIMIT", +}; + +static const char* pmu_port_disable_str[] = +{ + "PMU_PORT_DISABLE_ENABLE", + "PMU_PORT_DISABLE_DISABLE", +}; + +static const char* pooling_mode_str[] = +{ + "POOLING_MODE_MAX", + "POOLING_MODE_AVERAGE", + "POOLING_MODE_REDUCE_SUM", + "POOLING_MODE_SUM", + "POOLING_MODE_NONE", + "POOLING_MODE_MIN", + "POOLING_MODE_ARGMAX_X", + "POOLING_MODE_ARGMAX_Y", +}; + +static const char* privilege_level_str[] = +{ + "PRIVILEGE_LEVEL_USER", + "PRIVILEGE_LEVEL_PRIVILEGED", +}; + +static const char* ram_id_str[] = +{ + "RAM_ID_LUT", + "RAM_ID_IB", + "RAM_ID_AB", + "RAM_ID_CB", + "RAM_ID_OB", +}; + +static const char* resize_mode_str[] = +{ + "RESIZE_MODE_BILINEAR", + "RESIZE_MODE_REPLICATE", + "RESIZE_MODE_NEAREST", +}; + +static const char* round_mode_ifm_str[] = +{ + "ROUND_MODE_IFM_DOUBLE_SYMMETRIC", + "ROUND_MODE_IFM_NATURAL", +}; + +static const char* round_mode_ofm_str[] = +{ + "ROUND_MODE_OFM_DOUBLE_SYMMETRIC", + "ROUND_MODE_OFM_NATURAL", + "ROUND_MODE_OFM_DOUBLE_ASYMMETRIC", + "ROUND_MODE_OFM_SYMMETRIC", + "ROUND_MODE_OFM_TRUNCATE_TO_ZERO", + 
"ROUND_MODE_OFM_TRUNCATE_TO_LOWER", +}; + +static const char* security_level_str[] = +{ + "SECURITY_LEVEL_SECURE", + "SECURITY_LEVEL_NON_SECURE", +}; + +static const char* state_str[] = +{ + "STATE_STOPPED", + "STATE_RUNNING", +}; + +static const char* wd_active_core_str[] = +{ + "WD_ACTIVE_CORE_NONE", + "WD_ACTIVE_CORE_STANDARD", + "WD_ACTIVE_CORE_FAST", + "WD_ACTIVE_CORE_TENSOR", +}; + +static const char* weight_format_str[] = +{ + "WEIGHT_FORMAT_SWD", + "WEIGHT_FORMAT_FWD", +}; + +static const char* weight_order_str[] = +{ + "WEIGHT_ORDER_DEPTH_FIRST", + "WEIGHT_ORDER_PART_KERNEL_FIRST", +}; + +static const char* weight_sparsity_str[] = +{ + "WEIGHT_SPARSITY_NONE", + "WEIGHT_SPARSITY_SPARSE_2_4", +}; + +#endif + + + +struct id_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t version_status : 4; + uint32_t version_minor : 4; + uint32_t version_major : 4; + uint32_t product_major : 4; + uint32_t arch_patch_rev : 4; + uint32_t arch_minor_rev : 8; + uint32_t arch_major_rev : 4; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR id_r() : + word0(536899584) + {} + CONSTEXPR id_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + id_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_version_status() const + { + auto v = ((1U << 4) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR id_r& set_version_status(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<0) & word0) | ((((1U << 4) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_version_minor() const + { + auto v = ((1U << 4) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR id_r& set_version_minor(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<4) & word0) | ((((1U << 4) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_version_major() const + { + auto v = ((1U << 4) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR id_r& 
set_version_major(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<8) & word0) | ((((1U << 4) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_product_major() const + { + auto v = ((1U << 4) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR id_r& set_product_major(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<12) & word0) | ((((1U << 4) - 1) & value) << 12); + return *this; + } + CONSTEXPR uint32_t get_arch_patch_rev() const + { + auto v = ((1U << 4) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR id_r& set_arch_patch_rev(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<16) & word0) | ((((1U << 4) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_arch_minor_rev() const + { + auto v = ((1U << 8) - 1) & (word0 >> 20); + return v; + } + CONSTEXPR id_r& set_arch_minor_rev(uint32_t value) + { + word0 = (~(((1U << 8) - 1)<<20) & word0) | ((((1U << 8) - 1) & value) << 20); + return *this; + } + CONSTEXPR uint32_t get_arch_major_rev() const + { + auto v = ((1U << 4) - 1) & (word0 >> 28); + return v; + } + CONSTEXPR id_r& set_arch_major_rev(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<28) & word0) | ((((1U << 4) - 1) & value) << 28); + return *this; + } +#endif +}; + + +struct status_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t state : 1; + uint32_t irq_raised : 1; + uint32_t bus_status : 1; + uint32_t reset_status : 1; + uint32_t cmd_parse_error : 1; + uint32_t cmd_end_reached : 1; + uint32_t pmu_irq_raised : 1; + uint32_t reserved0 : 1; + uint32_t ecc_fault : 1; + uint32_t branch_fault : 1; + uint32_t reserved1 : 1; + uint32_t faulting_interface : 1; + uint32_t faulting_channel : 4; + uint32_t irq_history_mask : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR status_r() : + word0(8) + {} + CONSTEXPR status_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + 
status_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::state get_state() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR status_r& set_state(NPU_NAMESPACE::state value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR uint32_t get_irq_raised() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR status_r& set_irq_raised(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_bus_status() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR status_r& set_bus_status(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_reset_status() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR status_r& set_reset_status(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_cmd_parse_error() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR status_r& set_cmd_parse_error(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_cmd_end_reached() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR status_r& set_cmd_end_reached(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_pmu_irq_raised() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR status_r& set_pmu_irq_raised(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t 
get_ecc_fault() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR status_r& set_ecc_fault(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_branch_fault() const + { + auto v = ((1U << 1) - 1) & (word0 >> 9); + return v; + } + CONSTEXPR status_r& set_branch_fault(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<9) & word0) | ((((1U << 1) - 1) & value) << 9); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_fault_src get_faulting_interface() const + { + auto v = ((1U << 1) - 1) & (word0 >> 11); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR status_r& set_faulting_interface(NPU_NAMESPACE::dma_fault_src value) + { + word0 = (~(((1U << 1) - 1)<<11) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 11); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_fault_channel get_faulting_channel() const + { + auto v = ((1U << 4) - 1) & (word0 >> 12); + assert(v <= 9); + return static_cast(v); + } + CONSTEXPR status_r& set_faulting_channel(NPU_NAMESPACE::dma_fault_channel value) + { + word0 = (~(((1U << 4) - 1)<<12) & word0) | ((((1U << 4) - 1) & static_cast(value)) << 12); + return *this; + } + CONSTEXPR uint32_t get_irq_history_mask() const + { + auto v = ((1U << 16) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR status_r& set_irq_history_mask(uint32_t value) + { + word0 = (~(((1U << 16) - 1)<<16) & word0) | ((((1U << 16) - 1) & value) << 16); + return *this; + } +#endif +}; + + +struct cmd_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t transition_to_running_state : 1; + uint32_t clear_irq : 1; + uint32_t clock_q_enable : 1; + uint32_t power_q_enable : 1; + uint32_t stop_request : 1; + uint32_t reserved0 : 11; + uint32_t clear_irq_history : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cmd_r() : + word0(12) + {} + CONSTEXPR cmd_r(uint32_t init) : + word0(init) + {} + CONSTEXPR 
void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cmd_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_transition_to_running_state() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR cmd_r& set_transition_to_running_state(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_clear_irq() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR cmd_r& set_clear_irq(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_clock_q_enable() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR cmd_r& set_clock_q_enable(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_power_q_enable() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR cmd_r& set_power_q_enable(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_stop_request() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR cmd_r& set_stop_request(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_clear_irq_history() const + { + auto v = ((1U << 16) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR cmd_r& set_clear_irq_history(uint32_t value) + { + word0 = (~(((1U << 16) - 1)<<16) & word0) | ((((1U << 16) - 1) & value) << 16); + return *this; + } +#endif +}; + + +struct reset_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t pending_CPL : 1; + uint32_t pending_CSL : 1; + uint32_t reserved0 : 30; + }; + uint32_t word; + }; +#else +private: + uint32_t 
word0; +public: + CONSTEXPR reset_r() : + word0(0) + {} + CONSTEXPR reset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + reset_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::privilege_level get_pending_CPL() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR reset_r& set_pending_CPL(NPU_NAMESPACE::privilege_level value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::security_level get_pending_CSL() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR reset_r& set_pending_CSL(NPU_NAMESPACE::security_level value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 1); + return *this; + } +#endif +}; + + +struct qbase_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR qbase_r() : + word0(0), + word1(0) + {} + CONSTEXPR qbase_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + qbase_r copy() + { + return *this; + } +#endif +}; + + +struct qread_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t QREAD : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR qread_r() 
: + word0(0) + {} + CONSTEXPR qread_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + qread_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_QREAD() const + { + auto v = word0; + return v; + } + CONSTEXPR qread_r& set_QREAD(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct qconfig_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t cmd_region0 : 2; + uint32_t reserved0 : 30; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR qconfig_r() : + word0(0) + {} + CONSTEXPR qconfig_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + qconfig_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_cmd_region0() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR qconfig_r& set_cmd_region0(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & value) << 0); + return *this; + } +#endif +}; + + +struct qsize_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t QSIZE : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR qsize_r() : + word0(0) + {} + CONSTEXPR qsize_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + qsize_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_QSIZE() const + { + auto v = word0; + return v; + } + CONSTEXPR qsize_r& set_QSIZE(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct prot_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t active_CPL : 1; + uint32_t active_CSL : 1; + uint32_t reserved0 : 30; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR prot_r() 
: + word0(0) + {} + CONSTEXPR prot_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + prot_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::privilege_level get_active_CPL() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR prot_r& set_active_CPL(NPU_NAMESPACE::privilege_level value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::security_level get_active_CSL() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR prot_r& set_active_CSL(NPU_NAMESPACE::security_level value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 1); + return *this; + } +#endif +}; + + +struct config_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t macs_per_cc : 4; + uint32_t cmd_stream_version : 4; + uint32_t num_axi_sram : 2; + uint32_t num_axi_ext : 1; + uint32_t reserved0 : 1; + uint32_t num_wd : 2; + uint32_t reserved1 : 13; + uint32_t custom_dma : 1; + uint32_t product : 4; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR config_r() : + word0(536870928) + {} + CONSTEXPR config_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + config_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_macs_per_cc() const + { + auto v = ((1U << 4) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR config_r& set_macs_per_cc(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<0) & word0) | ((((1U << 4) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_cmd_stream_version() const + { + auto v = ((1U << 4) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR 
config_r& set_cmd_stream_version(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<4) & word0) | ((((1U << 4) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_num_axi_sram() const + { + auto v = ((1U << 2) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR config_r& set_num_axi_sram(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<8) & word0) | ((((1U << 2) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_num_axi_ext() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR config_r& set_num_axi_ext(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } + CONSTEXPR uint32_t get_num_wd() const + { + auto v = ((1U << 2) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR config_r& set_num_wd(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<12) & word0) | ((((1U << 2) - 1) & value) << 12); + return *this; + } + CONSTEXPR NPU_NAMESPACE::custom_dma get_custom_dma() const + { + auto v = ((1U << 1) - 1) & (word0 >> 27); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR config_r& set_custom_dma(NPU_NAMESPACE::custom_dma value) + { + word0 = (~(((1U << 1) - 1)<<27) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 27); + return *this; + } + CONSTEXPR uint32_t get_product() const + { + auto v = ((1U << 4) - 1) & (word0 >> 28); + return v; + } + CONSTEXPR config_r& set_product(uint32_t value) + { + word0 = (~(((1U << 4) - 1)<<28) & word0) | ((((1U << 4) - 1) & value) << 28); + return *this; + } +#endif +}; + + +struct cond_status_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t result_flag : 1; + uint32_t reserved0 : 31; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cond_status_r() : + word0(0) + {} + CONSTEXPR cond_status_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + 
cond_status_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_result_flag() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR cond_status_r& set_result_flag(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } +#endif +}; + + +struct power_ctrl_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t mac_step_cycles : 6; + uint32_t reserved0 : 26; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR power_ctrl_r() : + word0(0) + {} + CONSTEXPR power_ctrl_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + power_ctrl_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_mac_step_cycles() const + { + auto v = ((1U << 6) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR power_ctrl_r& set_mac_step_cycles(uint32_t value) + { + word0 = (~(((1U << 6) - 1)<<0) & word0) | ((((1U << 6) - 1) & value) << 0); + return *this; + } +#endif +}; + + +struct regioncfg_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t region0 : 2; + uint32_t region1 : 2; + uint32_t region2 : 2; + uint32_t region3 : 2; + uint32_t region4 : 2; + uint32_t region5 : 2; + uint32_t region6 : 2; + uint32_t region7 : 2; + uint32_t reserved0 : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR regioncfg_r() : + word0(0) + {} + CONSTEXPR regioncfg_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + regioncfg_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_region0() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR regioncfg_r& set_region0(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & value) << 0); + return *this; + } + 
CONSTEXPR uint32_t get_region1() const + { + auto v = ((1U << 2) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR regioncfg_r& set_region1(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<2) & word0) | ((((1U << 2) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_region2() const + { + auto v = ((1U << 2) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR regioncfg_r& set_region2(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<4) & word0) | ((((1U << 2) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_region3() const + { + auto v = ((1U << 2) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR regioncfg_r& set_region3(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<6) & word0) | ((((1U << 2) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_region4() const + { + auto v = ((1U << 2) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR regioncfg_r& set_region4(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<8) & word0) | ((((1U << 2) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_region5() const + { + auto v = ((1U << 2) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR regioncfg_r& set_region5(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<10) & word0) | ((((1U << 2) - 1) & value) << 10); + return *this; + } + CONSTEXPR uint32_t get_region6() const + { + auto v = ((1U << 2) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR regioncfg_r& set_region6(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<12) & word0) | ((((1U << 2) - 1) & value) << 12); + return *this; + } + CONSTEXPR uint32_t get_region7() const + { + auto v = ((1U << 2) - 1) & (word0 >> 14); + return v; + } + CONSTEXPR regioncfg_r& set_region7(uint32_t value) + { + word0 = (~(((1U << 2) - 1)<<14) & word0) | ((((1U << 2) - 1) & value) << 14); + return *this; + } +#endif +}; + + +struct mem_attr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t mem_domain : 2; + uint32_t axi_port : 1; + uint32_t reserved0 : 1; + uint32_t 
memtype : 4; + uint32_t reserved1 : 24; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR mem_attr_r() : + word0(0) + {} + CONSTEXPR mem_attr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + mem_attr_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::axi_mem_domain get_mem_domain() const + { + auto v = ((1U << 2) - 1) & (word0 >> 0); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR mem_attr_r& set_mem_domain(NPU_NAMESPACE::axi_mem_domain value) + { + word0 = (~(((1U << 2) - 1)<<0) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::axi_port get_axi_port() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR mem_attr_r& set_axi_port(NPU_NAMESPACE::axi_port value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 2); + return *this; + } + CONSTEXPR NPU_NAMESPACE::axi_mem_encoding get_memtype() const + { + auto v = ((1U << 4) - 1) & (word0 >> 4); + assert(v <= 11); + return static_cast(v); + } + CONSTEXPR mem_attr_r& set_memtype(NPU_NAMESPACE::axi_mem_encoding value) + { + word0 = (~(((1U << 4) - 1)<<4) & word0) | ((((1U << 4) - 1) & static_cast(value)) << 4); + return *this; + } +#endif +}; + + +struct axi_sram_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t max_outstanding_read_m1 : 6; + uint32_t reserved0 : 2; + uint32_t max_outstanding_write_m1 : 5; + uint32_t reserved1 : 3; + uint32_t max_beats : 2; + uint32_t reserved2 : 14; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR axi_sram_r() : + word0(0) + {} + CONSTEXPR axi_sram_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + axi_sram_r 
copy() + { + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_read_m1() const + { + auto v = ((1U << 6) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR axi_sram_r& set_max_outstanding_read_m1(uint32_t value) + { + word0 = (~(((1U << 6) - 1)<<0) & word0) | ((((1U << 6) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_write_m1() const + { + auto v = ((1U << 5) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR axi_sram_r& set_max_outstanding_write_m1(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<8) & word0) | ((((1U << 5) - 1) & value) << 8); + return *this; + } + CONSTEXPR NPU_NAMESPACE::max_beats get_max_beats() const + { + auto v = ((1U << 2) - 1) & (word0 >> 16); + assert(v <= 2); + return static_cast(v); + } + CONSTEXPR axi_sram_r& set_max_beats(NPU_NAMESPACE::max_beats value) + { + word0 = (~(((1U << 2) - 1)<<16) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 16); + return *this; + } +#endif +}; + + +struct axi_ext_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t max_outstanding_read_m1 : 6; + uint32_t reserved0 : 2; + uint32_t max_outstanding_write_m1 : 5; + uint32_t reserved1 : 3; + uint32_t max_beats : 2; + uint32_t reserved2 : 14; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR axi_ext_r() : + word0(0) + {} + CONSTEXPR axi_ext_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + axi_ext_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_read_m1() const + { + auto v = ((1U << 6) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR axi_ext_r& set_max_outstanding_read_m1(uint32_t value) + { + word0 = (~(((1U << 6) - 1)<<0) & word0) | ((((1U << 6) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_write_m1() const + { + auto v = ((1U << 5) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR axi_ext_r& 
set_max_outstanding_write_m1(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<8) & word0) | ((((1U << 5) - 1) & value) << 8); + return *this; + } + CONSTEXPR NPU_NAMESPACE::max_beats get_max_beats() const + { + auto v = ((1U << 2) - 1) & (word0 >> 16); + assert(v <= 2); + return static_cast(v); + } + CONSTEXPR axi_ext_r& set_max_beats(NPU_NAMESPACE::max_beats value) + { + word0 = (~(((1U << 2) - 1)<<16) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 16); + return *this; + } +#endif +}; + + +struct cfg_sram_cap_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t max_outstanding_read_m1 : 6; + uint32_t reserved0 : 2; + uint32_t max_outstanding_write_m1 : 5; + uint32_t reserved1 : 3; + uint32_t max_beats : 2; + uint32_t reserved2 : 14; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cfg_sram_cap_r() : + word0(0) + {} + CONSTEXPR cfg_sram_cap_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cfg_sram_cap_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_read_m1() const + { + auto v = ((1U << 6) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR cfg_sram_cap_r& set_max_outstanding_read_m1(uint32_t value) + { + word0 = (~(((1U << 6) - 1)<<0) & word0) | ((((1U << 6) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_write_m1() const + { + auto v = ((1U << 5) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR cfg_sram_cap_r& set_max_outstanding_write_m1(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<8) & word0) | ((((1U << 5) - 1) & value) << 8); + return *this; + } + CONSTEXPR NPU_NAMESPACE::max_beats get_max_beats() const + { + auto v = ((1U << 2) - 1) & (word0 >> 16); + assert(v <= 2); + return static_cast(v); + } + CONSTEXPR cfg_sram_cap_r& set_max_beats(NPU_NAMESPACE::max_beats value) + { + word0 = (~(((1U << 2) - 1)<<16) & word0) | ((((1U << 2) - 1) 
& static_cast(value)) << 16); + return *this; + } +#endif +}; + + +struct cfg_ext_cap_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t max_outstanding_read_m1 : 6; + uint32_t reserved0 : 2; + uint32_t max_outstanding_write_m1 : 5; + uint32_t reserved1 : 3; + uint32_t max_beats : 2; + uint32_t reserved2 : 14; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cfg_ext_cap_r() : + word0(0) + {} + CONSTEXPR cfg_ext_cap_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cfg_ext_cap_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_read_m1() const + { + auto v = ((1U << 6) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR cfg_ext_cap_r& set_max_outstanding_read_m1(uint32_t value) + { + word0 = (~(((1U << 6) - 1)<<0) & word0) | ((((1U << 6) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_max_outstanding_write_m1() const + { + auto v = ((1U << 5) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR cfg_ext_cap_r& set_max_outstanding_write_m1(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<8) & word0) | ((((1U << 5) - 1) & value) << 8); + return *this; + } + CONSTEXPR NPU_NAMESPACE::max_beats get_max_beats() const + { + auto v = ((1U << 2) - 1) & (word0 >> 16); + assert(v <= 2); + return static_cast(v); + } + CONSTEXPR cfg_ext_cap_r& set_max_beats(NPU_NAMESPACE::max_beats value) + { + word0 = (~(((1U << 2) - 1)<<16) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 16); + return *this; + } +#endif +}; + + +struct cfg_sram_hash0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t zero : 6; + uint32_t hash_LO : 26; + uint32_t hash_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR cfg_sram_hash0_r() : + word0(0), + word1(0) + {} + CONSTEXPR cfg_sram_hash0_r(uint64_t init) : + 
word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + cfg_sram_hash0_r copy() + { + return *this; + } +#endif +}; + + +struct cfg_sram_hash1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t zero : 6; + uint32_t hash_LO : 26; + uint32_t hash_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR cfg_sram_hash1_r() : + word0(0), + word1(0) + {} + CONSTEXPR cfg_sram_hash1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + cfg_sram_hash1_r copy() + { + return *this; + } +#endif +}; + + +struct cfg_ext_hash0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t zero : 6; + uint32_t hash_LO : 26; + uint32_t hash_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR cfg_ext_hash0_r() : + word0(0), + word1(0) + {} + CONSTEXPR cfg_ext_hash0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & 
static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + cfg_ext_hash0_r copy() + { + return *this; + } +#endif +}; + + +struct basep_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR basep_r() : + word0(0), + word1(0) + {} + CONSTEXPR basep_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + basep_r copy() + { + return *this; + } +#endif +}; + + +struct clkforce_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t top_level_clk : 1; + uint32_t cc_clk : 1; + uint32_t dma_clk : 1; + uint32_t mac_clk : 1; + uint32_t ao_clk : 1; + uint32_t wd_clk : 1; + uint32_t reserved0 : 26; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR clkforce_r() : + word0(0) + {} + CONSTEXPR clkforce_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + clkforce_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_top_level_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR clkforce_r& set_top_level_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_cc_clk() const + { + 
auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR clkforce_r& set_cc_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_dma_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR clkforce_r& set_dma_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_mac_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR clkforce_r& set_mac_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_ao_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR clkforce_r& set_ao_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_wd_clk() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR clkforce_r& set_wd_clk(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } +#endif +}; + + +struct debug_address_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t addr : 28; + uint32_t ram_id : 4; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR debug_address_r() : + word0(0) + {} + CONSTEXPR debug_address_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + debug_address_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_addr() const + { + auto v = ((1U << 28) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR debug_address_r& set_addr(uint32_t value) + { + word0 = (~(((1U << 28) - 1)<<0) & word0) | ((((1U << 28) - 1) & value) << 0); + return *this; + } + CONSTEXPR 
NPU_NAMESPACE::ram_id get_ram_id() const + { + auto v = ((1U << 4) - 1) & (word0 >> 28); + assert(v <= 4); + return static_cast(v); + } + CONSTEXPR debug_address_r& set_ram_id(NPU_NAMESPACE::ram_id value) + { + word0 = (~(((1U << 4) - 1)<<28) & word0) | ((((1U << 4) - 1) & static_cast(value)) << 28); + return *this; + } +#endif +}; + + +struct debug_misc_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t misc : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR debug_misc_r() : + word0(0) + {} + CONSTEXPR debug_misc_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + debug_misc_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_misc() const + { + auto v = word0; + return v; + } + CONSTEXPR debug_misc_r& set_misc(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_ifm_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_ifm_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_ifm_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_ifm_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_ifm_dst_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR 
dma_ifm_dst_r() : + word0(0) + {} + CONSTEXPR dma_ifm_dst_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma_ifm_dst_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma_ifm_dst_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_ofm_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma_ofm_src_r() : + word0(0) + {} + CONSTEXPR dma_ofm_src_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma_ofm_src_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma_ofm_src_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_ofm_dst_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_ofm_dst_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_ofm_dst_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_ofm_dst_r copy() + { + return *this; + } +#endif +}; + + +struct dma_weight_src_r +{ +#ifndef __cplusplus + union + { + 
struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_weight_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_weight_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_weight_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_cmd_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_cmd_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_cmd_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_cmd_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_cmd_size_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma_cmd_size_r() : + word0(0) + {} + CONSTEXPR dma_cmd_size_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + 
CONSTEXPR operator uint32_t() + { + return word0; + } + dma_cmd_size_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma_cmd_size_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_m2m_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_m2m_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_m2m_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_m2m_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_m2m_dst_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_m2m_dst_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_m2m_dst_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_m2m_dst_r copy() + { + return *this; + } +#endif +}; + + +struct 
current_qread_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR current_qread_r() : + word0(0) + {} + CONSTEXPR current_qread_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + current_qread_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR current_qread_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma_scale_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_scale_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_scale_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_scale_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_weight1_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_weight1_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_weight1_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} 
+ CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_weight1_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_weight2_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_weight2_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_weight2_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma_weight2_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma_weight3_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t offset_LO : 32; + uint32_t offset_HI : 8; + uint32_t reserved0 : 24; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma_weight3_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma_weight3_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) 
<< 32) | word0; + } + dma_weight3_src_r copy() + { + return *this; + } +#endif +}; + + +struct current_op_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR current_op_r() : + word0(0) + {} + CONSTEXPR current_op_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + current_op_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR current_op_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct current_cmd_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR current_cmd_r() : + word0(0) + {} + CONSTEXPR current_cmd_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + current_cmd_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR current_cmd_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct internal_memory_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t mem_word : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR internal_memory_r() : + word0(0) + {} + CONSTEXPR internal_memory_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + internal_memory_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_mem_word() const + { + auto v = word0; + return v; + } + CONSTEXPR internal_memory_r& set_mem_word(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; 
+ + +struct ifm_pad_top_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_pad_top_r() : + word0(0) + {} + CONSTEXPR ifm_pad_top_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_pad_top_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_pad_top_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_pad_left_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_pad_left_r() : + word0(0) + {} + CONSTEXPR ifm_pad_left_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_pad_left_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_pad_left_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_pad_right_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_pad_right_r() : + word0(0) + {} + CONSTEXPR ifm_pad_right_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_pad_right_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_pad_right_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_pad_bottom_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t 
value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_pad_bottom_r() : + word0(0) + {} + CONSTEXPR ifm_pad_bottom_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_pad_bottom_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_pad_bottom_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_depth_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_depth_m1_r() : + word0(0) + {} + CONSTEXPR ifm_depth_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_depth_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_depth_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_precision_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_precision_r() : + word0(0) + {} + CONSTEXPR ifm_precision_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_precision_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_precision_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_upscale_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; 
+public: + CONSTEXPR ifm_upscale_r() : + word0(0) + {} + CONSTEXPR ifm_upscale_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_upscale_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_upscale_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_broadcast_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_broadcast_r() : + word0(0) + {} + CONSTEXPR ifm_broadcast_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_broadcast_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_broadcast_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_zero_point_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_zero_point_r() : + word0(0) + {} + CONSTEXPR ifm_zero_point_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_zero_point_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_zero_point_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_width0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_width0_m1_r() : + word0(0) + {} + CONSTEXPR 
ifm_width0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_width0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_width0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_height0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_height0_m1_r() : + word0(0) + {} + CONSTEXPR ifm_height0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_height0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_height0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_height1_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_height1_m1_r() : + word0(0) + {} + CONSTEXPR ifm_height1_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_height1_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_height1_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm_region_r() : + word0(0) + {} + CONSTEXPR ifm_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void 
operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_width_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_width_m1_r() : + word0(0) + {} + CONSTEXPR ofm_width_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_width_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_width_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_height_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_height_m1_r() : + word0(0) + {} + CONSTEXPR ofm_height_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_height_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_height_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_depth_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_depth_m1_r() : + word0(0) + {} + CONSTEXPR ofm_depth_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + 
return word0; + } + ofm_depth_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_depth_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_precision_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_precision_r() : + word0(0) + {} + CONSTEXPR ofm_precision_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_precision_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_precision_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_blk_width_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_blk_width_m1_r() : + word0(0) + {} + CONSTEXPR ofm_blk_width_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_blk_width_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_blk_width_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_blk_height_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_blk_height_m1_r() : + word0(0) + {} + CONSTEXPR ofm_blk_height_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_blk_height_m1_r copy() + { 
+ return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_blk_height_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_blk_depth_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_blk_depth_m1_r() : + word0(0) + {} + CONSTEXPR ofm_blk_depth_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_blk_depth_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_blk_depth_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_zero_point_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_zero_point_r() : + word0(0) + {} + CONSTEXPR ofm_zero_point_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_zero_point_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_zero_point_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_width0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_width0_m1_r() : + word0(0) + {} + CONSTEXPR ofm_width0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_width0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() 
const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_width0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_height0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_height0_m1_r() : + word0(0) + {} + CONSTEXPR ofm_height0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_height0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_height0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_height1_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_height1_m1_r() : + word0(0) + {} + CONSTEXPR ofm_height1_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_height1_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_height1_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ofm_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ofm_region_r() : + word0(0) + {} + CONSTEXPR ofm_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ofm_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ofm_region_r& 
set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct kernel_width_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_width_m1_r() : + word0(0) + {} + CONSTEXPR kernel_width_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_width_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR kernel_width_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct kernel_height_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_height_m1_r() : + word0(0) + {} + CONSTEXPR kernel_height_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_height_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR kernel_height_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct kernel_stride_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR kernel_stride_r() : + word0(0) + {} + CONSTEXPR kernel_stride_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + kernel_stride_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR kernel_stride_r& set_value(uint32_t value) + { + word0 = value; + 
return *this; + } +#endif +}; + + +struct acc_format_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR acc_format_r() : + word0(0) + {} + CONSTEXPR acc_format_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + acc_format_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR acc_format_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct activation_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR activation_r() : + word0(0) + {} + CONSTEXPR activation_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + activation_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR activation_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct activation_min_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR activation_min_r() : + word0(0) + {} + CONSTEXPR activation_min_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + activation_min_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR activation_min_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct activation_max_r +{ +#ifndef __cplusplus + union + { + 
struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR activation_max_r() : + word0(0) + {} + CONSTEXPR activation_max_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + activation_max_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR activation_max_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct weight_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR weight_region_r() : + word0(0) + {} + CONSTEXPR weight_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + weight_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR weight_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct scale_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR scale_region_r() : + word0(0) + {} + CONSTEXPR scale_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + scale_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR scale_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct resize_x_scale_n_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else 
+private: + uint32_t word0; +public: + CONSTEXPR resize_x_scale_n_m1_r() : + word0(0) + {} + CONSTEXPR resize_x_scale_n_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + resize_x_scale_n_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR resize_x_scale_n_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct resize_y_scale_n_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR resize_y_scale_n_m1_r() : + word0(0) + {} + CONSTEXPR resize_y_scale_n_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + resize_y_scale_n_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR resize_y_scale_n_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct resize_x_offset_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR resize_x_offset_r() : + word0(0) + {} + CONSTEXPR resize_x_offset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + resize_x_offset_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR resize_x_offset_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct resize_y_offset_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: 
+ uint32_t word0; +public: + CONSTEXPR resize_y_offset_r() : + word0(0) + {} + CONSTEXPR resize_y_offset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + resize_y_offset_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR resize_y_offset_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct weight_format_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR weight_format_r() : + word0(0) + {} + CONSTEXPR weight_format_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + weight_format_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR weight_format_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct blockdep_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR blockdep_r() : + word0(0) + {} + CONSTEXPR blockdep_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + blockdep_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR blockdep_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_src_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma0_src_region_r() : + word0(0) + {} + CONSTEXPR 
dma0_src_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma0_src_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma0_src_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_dst_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma0_dst_region_r() : + word0(0) + {} + CONSTEXPR dma0_dst_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma0_dst_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma0_dst_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_size0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma0_size0_r() : + word0(0) + {} + CONSTEXPR dma0_size0_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma0_size0_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma0_size0_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_size1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma0_size1_r() : + word0(0) + {} + CONSTEXPR dma0_size1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void 
operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma0_size1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma0_size1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct dma0_idx_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma0_idx_region_r() : + word0(0) + {} + CONSTEXPR dma0_idx_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma0_idx_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR dma0_idx_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_broadcast_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_broadcast_r() : + word0(0) + {} + CONSTEXPR ifm2_broadcast_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_broadcast_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_broadcast_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_precision_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_precision_r() : + word0(0) + {} + CONSTEXPR ifm2_precision_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR 
operator uint32_t() + { + return word0; + } + ifm2_precision_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_precision_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_zero_point_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_zero_point_r() : + word0(0) + {} + CONSTEXPR ifm2_zero_point_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_zero_point_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_zero_point_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_width0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_width0_m1_r() : + word0(0) + {} + CONSTEXPR ifm2_width0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_width0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_width0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_height0_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_height0_m1_r() : + word0(0) + {} + CONSTEXPR ifm2_height0_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + 
ifm2_height0_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_height0_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_height1_m1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_height1_m1_r() : + word0(0) + {} + CONSTEXPR ifm2_height1_m1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_height1_m1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_height1_m1_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm2_region_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ifm2_region_r() : + word0(0) + {} + CONSTEXPR ifm2_region_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ifm2_region_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR ifm2_region_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct ifm_base0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_base0_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_base0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR 
void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_base0_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_base1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_base1_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_base1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_base1_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_base2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_base2_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_base2_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_base2_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_base3_r +{ +#ifndef 
__cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_base3_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_base3_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_base3_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_stride_x_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_stride_x_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_stride_x_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_stride_x_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_stride_y_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_stride_y_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_stride_y_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + 
word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_stride_y_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_stride_c_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_stride_c_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_stride_c_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_stride_c_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_base0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_base0_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_base0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | 
word0; + } + ofm_base0_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_base1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_base1_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_base1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_base1_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_base2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_base2_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_base2_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_base2_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_base3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_base3_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_base3_r(uint64_t 
init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_base3_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_stride_x_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_stride_x_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_stride_x_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_stride_x_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_stride_y_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_stride_y_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_stride_y_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & 
static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_stride_y_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_stride_c_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_stride_c_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_stride_c_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_stride_c_r copy() + { + return *this; + } +#endif +}; + + +struct weight_base_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight_base_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight_base_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight_base_r copy() + { + return *this; + } +#endif +}; + + +struct weight_length_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; 
+#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight_length_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight_length_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight_length_r copy() + { + return *this; + } +#endif +}; + + +struct scale_base_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR scale_base_r() : + word0(0), + word1(0) + {} + CONSTEXPR scale_base_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + scale_base_r copy() + { + return *this; + } +#endif +}; + + +struct scale_length_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR scale_length_r() : + word0(0), + word1(0) + {} + CONSTEXPR scale_length_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t 
value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + scale_length_r copy() + { + return *this; + } +#endif +}; + + +struct ofm_scale_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ofm_scale_r() : + word0(0), + word1(0) + {} + CONSTEXPR ofm_scale_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ofm_scale_r copy() + { + return *this; + } +#endif +}; + + +struct ifm_scale_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm_scale_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm_scale_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm_scale_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_scale_r +{ +#ifndef __cplusplus + union + { 
+ struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_scale_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_scale_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_scale_r copy() + { + return *this; + } +#endif +}; + + +struct op_scalar_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR op_scalar_r() : + word0(0), + word1(0) + {} + CONSTEXPR op_scalar_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + op_scalar_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_src_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_src_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_src_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & 
static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_src_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_dst_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_dst_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_dst_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_dst_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_len_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_len_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_len_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_len_r copy() + { + return *this; + } 
+#endif +}; + + +struct dma0_src_stride0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_src_stride0_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_src_stride0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_src_stride0_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_src_stride1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_src_stride1_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_src_stride1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_src_stride1_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_dst_stride0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_dst_stride0_r() : + word0(0), + word1(0) + {} + CONSTEXPR 
dma0_dst_stride0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_dst_stride0_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_dst_stride1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_dst_stride1_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_dst_stride1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_dst_stride1_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_idx_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_idx_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_idx_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 
32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_idx_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_base0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_base0_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_base0_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_base0_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_base1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_base1_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_base1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_base1_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_base2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: 
+ uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_base2_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_base2_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_base2_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_base3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_base3_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_base3_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_base3_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_stride_x_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_stride_x_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_stride_x_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = 
static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_stride_x_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_stride_y_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_stride_y_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_stride_y_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_stride_y_r copy() + { + return *this; + } +#endif +}; + + +struct ifm2_stride_c_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR ifm2_stride_c_r() : + word0(0), + word1(0) + {} + CONSTEXPR ifm2_stride_c_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + ifm2_stride_c_r copy() + { + return *this; + } +#endif +}; + + +struct weight1_base_r +{ +#ifndef 
__cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight1_base_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight1_base_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight1_base_r copy() + { + return *this; + } +#endif +}; + + +struct weight1_length_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight1_length_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight1_length_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight1_length_r copy() + { + return *this; + } +#endif +}; + + +struct weight2_base_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight2_base_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight2_base_r(uint64_t init) : + word0(static_cast((init) & 
static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight2_base_r copy() + { + return *this; + } +#endif +}; + + +struct weight2_length_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight2_length_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight2_length_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight2_length_r copy() + { + return *this; + } +#endif +}; + + +struct weight3_base_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight3_base_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight3_base_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR 
operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight3_base_r copy() + { + return *this; + } +#endif +}; + + +struct weight3_length_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR weight3_length_r() : + word0(0), + word1(0) + {} + CONSTEXPR weight3_length_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + weight3_length_r copy() + { + return *this; + } +#endif +}; + + +struct resize_x_step_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR resize_x_step_r() : + word0(0), + word1(0) + {} + CONSTEXPR resize_x_step_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + resize_x_step_r copy() + { + return *this; + } +#endif +}; + + +struct resize_y_step_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t 
word1; +public: + CONSTEXPR resize_y_step_r() : + word0(0), + word1(0) + {} + CONSTEXPR resize_y_step_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + resize_y_step_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_idx_max_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_idx_max_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_idx_max_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_idx_max_r copy() + { + return *this; + } +#endif +}; + + +struct dma0_idx_skip1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value_LO : 32; + uint32_t value_HI : 32; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR dma0_idx_skip1_r() : + word0(0), + word1(0) + {} + CONSTEXPR dma0_idx_skip1_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = 
static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + dma0_idx_skip1_r copy() + { + return *this; + } +#endif +}; + + +struct revision_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t value : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR revision_r() : + word0(0) + {} + CONSTEXPR revision_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + revision_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_value() const + { + auto v = word0; + return v; + } + CONSTEXPR revision_r& set_value(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid4_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID4 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid4_r() : + word0(4) + {} + CONSTEXPR pid4_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid4_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID4() const + { + auto v = word0; + return v; + } + CONSTEXPR pid4_r& set_PID4(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid5_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID5 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid5_r() : + word0(0) + {} + CONSTEXPR pid5_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid5_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID5() const + { + auto v = word0; + return v; + } + 
CONSTEXPR pid5_r& set_PID5(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid6_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID6 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid6_r() : + word0(0) + {} + CONSTEXPR pid6_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid6_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID6() const + { + auto v = word0; + return v; + } + CONSTEXPR pid6_r& set_PID6(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid7_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID7 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid7_r() : + word0(0) + {} + CONSTEXPR pid7_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid7_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID7() const + { + auto v = word0; + return v; + } + CONSTEXPR pid7_r& set_PID7(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID0 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid0_r() : + word0(130) + {} + CONSTEXPR pid0_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid0_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID0() const + { + auto v = word0; + return v; + } + CONSTEXPR pid0_r& set_PID0(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID1 : 32; + }; + uint32_t word; + }; 
+#else +private: + uint32_t word0; +public: + CONSTEXPR pid1_r() : + word0(181) + {} + CONSTEXPR pid1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID1() const + { + auto v = word0; + return v; + } + CONSTEXPR pid1_r& set_PID1(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID2 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid2_r() : + word0(11) + {} + CONSTEXPR pid2_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid2_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID2() const + { + auto v = word0; + return v; + } + CONSTEXPR pid2_r& set_PID2(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pid3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PID3 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pid3_r() : + word0(0) + {} + CONSTEXPR pid3_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pid3_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PID3() const + { + auto v = word0; + return v; + } + CONSTEXPR pid3_r& set_PID3(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct cid0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CID0 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cid0_r() : + word0(13) + {} + CONSTEXPR cid0_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + 
CONSTEXPR operator uint32_t() + { + return word0; + } + cid0_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_CID0() const + { + auto v = word0; + return v; + } + CONSTEXPR cid0_r& set_CID0(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct cid1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CID1 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cid1_r() : + word0(240) + {} + CONSTEXPR cid1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cid1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_CID1() const + { + auto v = word0; + return v; + } + CONSTEXPR cid1_r& set_CID1(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct cid2_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CID2 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cid2_r() : + word0(5) + {} + CONSTEXPR cid2_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cid2_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_CID2() const + { + auto v = word0; + return v; + } + CONSTEXPR cid2_r& set_CID2(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct cid3_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CID3 : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR cid3_r() : + word0(177) + {} + CONSTEXPR cid3_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + cid3_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_CID3() const + { + auto v = word0; + return v; + } + CONSTEXPR cid3_r& set_CID3(uint32_t value) + 
{ + word0 = value; + return *this; + } +#endif +}; + + +struct wd_status_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t ctrl_idle : 1; + uint32_t reserved0 : 1; + uint32_t active_core : 2; + uint32_t sc0_idle : 1; + uint32_t sc1_idle : 1; + uint32_t sc2_idle : 1; + uint32_t sc3_idle : 1; + uint32_t fc_idle : 1; + uint32_t tc_idle : 1; + uint32_t reserved1 : 6; + uint32_t wbuf0_valid : 1; + uint32_t wbuf0_idle : 1; + uint32_t wbuf1_valid : 1; + uint32_t wbuf1_idle : 1; + uint32_t wbuf2_valid : 1; + uint32_t wbuf2_idle : 1; + uint32_t wbuf3_valid : 1; + uint32_t wbuf3_idle : 1; + uint32_t stalled_by_ws_sc0 : 1; + uint32_t stalled_by_ws_sc1 : 1; + uint32_t stalled_by_ws_sc2 : 1; + uint32_t stalled_by_ws_sc3 : 1; + uint32_t stalled_by_ws_fc : 1; + uint32_t stalled_by_ws_tc : 1; + uint32_t stalled_by_wd_buf : 1; + uint32_t reserved2 : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR wd_status_r() : + word0(0) + {} + CONSTEXPR wd_status_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + wd_status_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_ctrl_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR wd_status_r& set_ctrl_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::wd_active_core get_active_core() const + { + auto v = ((1U << 2) - 1) & (word0 >> 2); + assert(v <= 3); + return static_cast(v); + } + CONSTEXPR wd_status_r& set_active_core(NPU_NAMESPACE::wd_active_core value) + { + word0 = (~(((1U << 2) - 1)<<2) & word0) | ((((1U << 2) - 1) & static_cast(value)) << 2); + return *this; + } + CONSTEXPR uint32_t get_sc0_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR wd_status_r& set_sc0_idle(uint32_t value) + { + word0 = 
(~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_sc1_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR wd_status_r& set_sc1_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_sc2_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR wd_status_r& set_sc2_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_sc3_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR wd_status_r& set_sc3_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_fc_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR wd_status_r& set_fc_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_tc_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 9); + return v; + } + CONSTEXPR wd_status_r& set_tc_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<9) & word0) | ((((1U << 1) - 1) & value) << 9); + return *this; + } + CONSTEXPR uint32_t get_wbuf0_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR wd_status_r& set_wbuf0_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<16) & word0) | ((((1U << 1) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_wbuf0_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 17); + return v; + } + CONSTEXPR wd_status_r& set_wbuf0_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<17) & word0) | ((((1U << 1) - 1) & value) << 17); + return *this; + } + CONSTEXPR uint32_t get_wbuf1_valid() const + { + auto v = ((1U << 1) - 
1) & (word0 >> 18); + return v; + } + CONSTEXPR wd_status_r& set_wbuf1_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<18) & word0) | ((((1U << 1) - 1) & value) << 18); + return *this; + } + CONSTEXPR uint32_t get_wbuf1_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 19); + return v; + } + CONSTEXPR wd_status_r& set_wbuf1_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<19) & word0) | ((((1U << 1) - 1) & value) << 19); + return *this; + } + CONSTEXPR uint32_t get_wbuf2_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 20); + return v; + } + CONSTEXPR wd_status_r& set_wbuf2_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<20) & word0) | ((((1U << 1) - 1) & value) << 20); + return *this; + } + CONSTEXPR uint32_t get_wbuf2_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 21); + return v; + } + CONSTEXPR wd_status_r& set_wbuf2_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<21) & word0) | ((((1U << 1) - 1) & value) << 21); + return *this; + } + CONSTEXPR uint32_t get_wbuf3_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 22); + return v; + } + CONSTEXPR wd_status_r& set_wbuf3_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<22) & word0) | ((((1U << 1) - 1) & value) << 22); + return *this; + } + CONSTEXPR uint32_t get_wbuf3_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 23); + return v; + } + CONSTEXPR wd_status_r& set_wbuf3_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<23) & word0) | ((((1U << 1) - 1) & value) << 23); + return *this; + } + CONSTEXPR uint32_t get_stalled_by_ws_sc0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 24); + return v; + } + CONSTEXPR wd_status_r& set_stalled_by_ws_sc0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<24) & word0) | ((((1U << 1) - 1) & value) << 24); + return *this; + } + CONSTEXPR uint32_t get_stalled_by_ws_sc1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 25); + return v; + } + CONSTEXPR wd_status_r& set_stalled_by_ws_sc1(uint32_t 
value) + { + word0 = (~(((1U << 1) - 1)<<25) & word0) | ((((1U << 1) - 1) & value) << 25); + return *this; + } + CONSTEXPR uint32_t get_stalled_by_ws_sc2() const + { + auto v = ((1U << 1) - 1) & (word0 >> 26); + return v; + } + CONSTEXPR wd_status_r& set_stalled_by_ws_sc2(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<26) & word0) | ((((1U << 1) - 1) & value) << 26); + return *this; + } + CONSTEXPR uint32_t get_stalled_by_ws_sc3() const + { + auto v = ((1U << 1) - 1) & (word0 >> 27); + return v; + } + CONSTEXPR wd_status_r& set_stalled_by_ws_sc3(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<27) & word0) | ((((1U << 1) - 1) & value) << 27); + return *this; + } + CONSTEXPR uint32_t get_stalled_by_ws_fc() const + { + auto v = ((1U << 1) - 1) & (word0 >> 28); + return v; + } + CONSTEXPR wd_status_r& set_stalled_by_ws_fc(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<28) & word0) | ((((1U << 1) - 1) & value) << 28); + return *this; + } + CONSTEXPR uint32_t get_stalled_by_ws_tc() const + { + auto v = ((1U << 1) - 1) & (word0 >> 29); + return v; + } + CONSTEXPR wd_status_r& set_stalled_by_ws_tc(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<29) & word0) | ((((1U << 1) - 1) & value) << 29); + return *this; + } + CONSTEXPR uint32_t get_stalled_by_wd_buf() const + { + auto v = ((1U << 1) - 1) & (word0 >> 30); + return v; + } + CONSTEXPR wd_status_r& set_stalled_by_wd_buf(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<30) & word0) | ((((1U << 1) - 1) & value) << 30); + return *this; + } +#endif +}; + + +struct mac_status_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t block_cfg_valid : 1; + uint32_t trav_en : 1; + uint32_t wait_for_ib : 1; + uint32_t wait_for_acc_buf : 1; + uint32_t wait_for_weights : 1; + uint32_t stall_stripe : 1; + uint32_t dw_sel : 1; + uint32_t wait_for_dw0_ready : 1; + uint32_t wait_for_dw1_ready : 1; + uint32_t acc_buf_sel_ai : 1; + uint32_t wait_for_acc0_ready : 1; + uint32_t wait_for_acc1_ready : 1; + uint32_t 
acc_buf_sel_aa : 1; + uint32_t acc0_valid : 1; + uint32_t acc1_valid : 1; + uint32_t reserved0 : 1; + uint32_t events : 11; + uint32_t reserved1 : 5; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR mac_status_r() : + word0(0) + {} + CONSTEXPR mac_status_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + mac_status_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_block_cfg_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR mac_status_r& set_block_cfg_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_trav_en() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR mac_status_r& set_trav_en(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_wait_for_ib() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_ib(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_wait_for_acc_buf() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_acc_buf(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_wait_for_weights() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_weights(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_stall_stripe() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR mac_status_r& 
set_stall_stripe(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_dw_sel() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR mac_status_r& set_dw_sel(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_wait_for_dw0_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_dw0_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_wait_for_dw1_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_dw1_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_acc_buf_sel_ai() const + { + auto v = ((1U << 1) - 1) & (word0 >> 9); + return v; + } + CONSTEXPR mac_status_r& set_acc_buf_sel_ai(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<9) & word0) | ((((1U << 1) - 1) & value) << 9); + return *this; + } + CONSTEXPR uint32_t get_wait_for_acc0_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_acc0_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } + CONSTEXPR uint32_t get_wait_for_acc1_ready() const + { + auto v = ((1U << 1) - 1) & (word0 >> 11); + return v; + } + CONSTEXPR mac_status_r& set_wait_for_acc1_ready(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<11) & word0) | ((((1U << 1) - 1) & value) << 11); + return *this; + } + CONSTEXPR uint32_t get_acc_buf_sel_aa() const + { + auto v = ((1U << 1) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR mac_status_r& set_acc_buf_sel_aa(uint32_t value) + { + word0 = (~(((1U 
<< 1) - 1)<<12) & word0) | ((((1U << 1) - 1) & value) << 12); + return *this; + } + CONSTEXPR uint32_t get_acc0_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 13); + return v; + } + CONSTEXPR mac_status_r& set_acc0_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<13) & word0) | ((((1U << 1) - 1) & value) << 13); + return *this; + } + CONSTEXPR uint32_t get_acc1_valid() const + { + auto v = ((1U << 1) - 1) & (word0 >> 14); + return v; + } + CONSTEXPR mac_status_r& set_acc1_valid(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<14) & word0) | ((((1U << 1) - 1) & value) << 14); + return *this; + } + CONSTEXPR uint32_t get_events() const + { + auto v = ((1U << 11) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR mac_status_r& set_events(uint32_t value) + { + word0 = (~(((1U << 11) - 1)<<16) & word0) | ((((1U << 11) - 1) & value) << 16); + return *this; + } +#endif +}; + + +struct ao_status_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t ao_active : 1; + uint32_t reserved0 : 2; + uint32_t ao_stalled_by_bs_or_ob : 1; + uint32_t ao_stalled_by_bs : 1; + uint32_t ao_stalled_by_ob : 1; + uint32_t ao_stalled_by_ab_or_cb : 1; + uint32_t ao_stalled_by_ab : 1; + uint32_t ao_stalled_by_cb : 1; + uint32_t reserved1 : 23; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR ao_status_r() : + word0(0) + {} + CONSTEXPR ao_status_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + ao_status_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_ao_active() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR ao_status_r& set_ao_active(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_ao_stalled_by_bs_or_ob() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR 
ao_status_r& set_ao_stalled_by_bs_or_ob(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_ao_stalled_by_bs() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR ao_status_r& set_ao_stalled_by_bs(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_ao_stalled_by_ob() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR ao_status_r& set_ao_stalled_by_ob(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_ao_stalled_by_ab_or_cb() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR ao_status_r& set_ao_stalled_by_ab_or_cb(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_ao_stalled_by_ab() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR ao_status_r& set_ao_stalled_by_ab(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_ao_stalled_by_cb() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR ao_status_r& set_ao_stalled_by_cb(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } +#endif +}; + + +struct dma_status0_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t cmd_ch_idle : 1; + uint32_t ifm0_ch_idle : 1; + uint32_t ifm1_ch_idle : 1; + uint32_t wgt_ch_idle : 1; + uint32_t bas_ch_idle : 1; + uint32_t m2m_ch_idle : 1; + uint32_t ofm_ch_idle : 1; + uint32_t axi_halt_req : 1; + uint32_t axi_halt_ack : 1; + uint32_t axi_pause_req : 1; + uint32_t axi_pause_ack : 1; + uint32_t cmd_abort_ack : 1; + uint32_t 
cmd_abort_req : 1; + uint32_t ifm_mac_if_stall : 1; + uint32_t ifm_tc_if_stall : 1; + uint32_t ifm_ao_if_stall : 1; + uint32_t ofm_if_stall : 1; + uint32_t cmd_if_stall : 1; + uint32_t wd_sc0_if_stall : 1; + uint32_t wd_sc1_if_stall : 1; + uint32_t wd_sc2_if_stall : 1; + uint32_t wd_sc3_if_stall : 1; + uint32_t wd_fc_if_stall : 1; + uint32_t bs_if_stall : 1; + uint32_t lutcfg_if_stall : 1; + uint32_t reserved0 : 7; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma_status0_r() : + word0(0) + {} + CONSTEXPR dma_status0_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma_status0_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_cmd_ch_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR dma_status0_r& set_cmd_ch_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_ifm0_ch_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR dma_status0_r& set_ifm0_ch_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_ifm1_ch_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR dma_status0_r& set_ifm1_ch_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_wgt_ch_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR dma_status0_r& set_wgt_ch_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_bas_ch_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR dma_status0_r& 
set_bas_ch_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_m2m_ch_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR dma_status0_r& set_m2m_ch_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_ofm_ch_idle() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR dma_status0_r& set_ofm_ch_idle(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_axi_halt_req() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR dma_status0_r& set_axi_halt_req(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_axi_halt_ack() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR dma_status0_r& set_axi_halt_ack(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_axi_pause_req() const + { + auto v = ((1U << 1) - 1) & (word0 >> 9); + return v; + } + CONSTEXPR dma_status0_r& set_axi_pause_req(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<9) & word0) | ((((1U << 1) - 1) & value) << 9); + return *this; + } + CONSTEXPR uint32_t get_axi_pause_ack() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR dma_status0_r& set_axi_pause_ack(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } + CONSTEXPR uint32_t get_cmd_abort_ack() const + { + auto v = ((1U << 1) - 1) & (word0 >> 11); + return v; + } + CONSTEXPR dma_status0_r& set_cmd_abort_ack(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<11) & word0) | ((((1U << 1) - 
1) & value) << 11); + return *this; + } + CONSTEXPR uint32_t get_cmd_abort_req() const + { + auto v = ((1U << 1) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR dma_status0_r& set_cmd_abort_req(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<12) & word0) | ((((1U << 1) - 1) & value) << 12); + return *this; + } + CONSTEXPR uint32_t get_ifm_mac_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 13); + return v; + } + CONSTEXPR dma_status0_r& set_ifm_mac_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<13) & word0) | ((((1U << 1) - 1) & value) << 13); + return *this; + } + CONSTEXPR uint32_t get_ifm_tc_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 14); + return v; + } + CONSTEXPR dma_status0_r& set_ifm_tc_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<14) & word0) | ((((1U << 1) - 1) & value) << 14); + return *this; + } + CONSTEXPR uint32_t get_ifm_ao_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 15); + return v; + } + CONSTEXPR dma_status0_r& set_ifm_ao_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<15) & word0) | ((((1U << 1) - 1) & value) << 15); + return *this; + } + CONSTEXPR uint32_t get_ofm_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR dma_status0_r& set_ofm_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<16) & word0) | ((((1U << 1) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_cmd_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 17); + return v; + } + CONSTEXPR dma_status0_r& set_cmd_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<17) & word0) | ((((1U << 1) - 1) & value) << 17); + return *this; + } + CONSTEXPR uint32_t get_wd_sc0_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 18); + return v; + } + CONSTEXPR dma_status0_r& set_wd_sc0_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<18) & word0) | ((((1U << 1) - 1) & value) << 18); + return *this; + } + CONSTEXPR 
uint32_t get_wd_sc1_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 19); + return v; + } + CONSTEXPR dma_status0_r& set_wd_sc1_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<19) & word0) | ((((1U << 1) - 1) & value) << 19); + return *this; + } + CONSTEXPR uint32_t get_wd_sc2_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 20); + return v; + } + CONSTEXPR dma_status0_r& set_wd_sc2_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<20) & word0) | ((((1U << 1) - 1) & value) << 20); + return *this; + } + CONSTEXPR uint32_t get_wd_sc3_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 21); + return v; + } + CONSTEXPR dma_status0_r& set_wd_sc3_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<21) & word0) | ((((1U << 1) - 1) & value) << 21); + return *this; + } + CONSTEXPR uint32_t get_wd_fc_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 22); + return v; + } + CONSTEXPR dma_status0_r& set_wd_fc_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<22) & word0) | ((((1U << 1) - 1) & value) << 22); + return *this; + } + CONSTEXPR uint32_t get_bs_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 23); + return v; + } + CONSTEXPR dma_status0_r& set_bs_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<23) & word0) | ((((1U << 1) - 1) & value) << 23); + return *this; + } + CONSTEXPR uint32_t get_lutcfg_if_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 24); + return v; + } + CONSTEXPR dma_status0_r& set_lutcfg_if_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<24) & word0) | ((((1U << 1) - 1) & value) << 24); + return *this; + } +#endif +}; + + +struct dma_status1_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t axi_sram0_ar_stalled : 1; + uint32_t axi_sram0_rd_limit_stall : 1; + uint32_t axi_sram0_aw_stalled : 1; + uint32_t axi_sram0_w_stalled : 1; + uint32_t axi_sram0_wr_limit_stall : 1; + uint32_t axi_sram1_ar_stalled : 1; + uint32_t 
axi_sram1_rd_limit_stall : 1; + uint32_t axi_sram1_aw_stalled : 1; + uint32_t axi_sram1_w_stalled : 1; + uint32_t axi_sram1_wr_limit_stall : 1; + uint32_t axi_sram2_ar_stalled : 1; + uint32_t axi_sram2_rd_limit_stall : 1; + uint32_t axi_sram2_aw_stalled : 1; + uint32_t axi_sram2_w_stalled : 1; + uint32_t axi_sram2_wr_limit_stall : 1; + uint32_t axi_sram3_ar_stalled : 1; + uint32_t axi_sram3_rd_limit_stall : 1; + uint32_t axi_sram3_aw_stalled : 1; + uint32_t axi_sram3_w_stalled : 1; + uint32_t axi_sram3_wr_limit_stall : 1; + uint32_t axi_ext0_ar_stalled : 1; + uint32_t axi_ext0_rd_limit_stall : 1; + uint32_t axi_ext0_aw_stalled : 1; + uint32_t axi_ext0_w_stalled : 1; + uint32_t axi_ext0_wr_limit_stall : 1; + uint32_t axi_ext1_ar_stalled : 1; + uint32_t axi_ext1_rd_limit_stall : 1; + uint32_t axi_ext1_aw_stalled : 1; + uint32_t axi_ext1_w_stalled : 1; + uint32_t axi_ext1_wr_limit_stall : 1; + uint32_t reserved0 : 2; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR dma_status1_r() : + word0(0) + {} + CONSTEXPR dma_status1_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + dma_status1_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_axi_sram0_ar_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram0_ar_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_axi_sram0_rd_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram0_rd_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_axi_sram0_aw_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR 
dma_status1_r& set_axi_sram0_aw_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_axi_sram0_w_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram0_w_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_axi_sram0_wr_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram0_wr_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_axi_sram1_ar_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram1_ar_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_axi_sram1_rd_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram1_rd_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_axi_sram1_aw_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram1_aw_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_axi_sram1_w_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram1_w_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & value) << 8); + return *this; + } + CONSTEXPR uint32_t get_axi_sram1_wr_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 9); + 
return v; + } + CONSTEXPR dma_status1_r& set_axi_sram1_wr_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<9) & word0) | ((((1U << 1) - 1) & value) << 9); + return *this; + } + CONSTEXPR uint32_t get_axi_sram2_ar_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram2_ar_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } + CONSTEXPR uint32_t get_axi_sram2_rd_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 11); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram2_rd_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<11) & word0) | ((((1U << 1) - 1) & value) << 11); + return *this; + } + CONSTEXPR uint32_t get_axi_sram2_aw_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 12); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram2_aw_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<12) & word0) | ((((1U << 1) - 1) & value) << 12); + return *this; + } + CONSTEXPR uint32_t get_axi_sram2_w_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 13); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram2_w_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<13) & word0) | ((((1U << 1) - 1) & value) << 13); + return *this; + } + CONSTEXPR uint32_t get_axi_sram2_wr_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 14); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram2_wr_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<14) & word0) | ((((1U << 1) - 1) & value) << 14); + return *this; + } + CONSTEXPR uint32_t get_axi_sram3_ar_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 15); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram3_ar_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<15) & word0) | ((((1U << 1) - 1) & value) << 15); + return *this; + } + CONSTEXPR uint32_t get_axi_sram3_rd_limit_stall() const 
+ { + auto v = ((1U << 1) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram3_rd_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<16) & word0) | ((((1U << 1) - 1) & value) << 16); + return *this; + } + CONSTEXPR uint32_t get_axi_sram3_aw_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 17); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram3_aw_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<17) & word0) | ((((1U << 1) - 1) & value) << 17); + return *this; + } + CONSTEXPR uint32_t get_axi_sram3_w_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 18); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram3_w_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<18) & word0) | ((((1U << 1) - 1) & value) << 18); + return *this; + } + CONSTEXPR uint32_t get_axi_sram3_wr_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 19); + return v; + } + CONSTEXPR dma_status1_r& set_axi_sram3_wr_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<19) & word0) | ((((1U << 1) - 1) & value) << 19); + return *this; + } + CONSTEXPR uint32_t get_axi_ext0_ar_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 20); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext0_ar_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<20) & word0) | ((((1U << 1) - 1) & value) << 20); + return *this; + } + CONSTEXPR uint32_t get_axi_ext0_rd_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 21); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext0_rd_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<21) & word0) | ((((1U << 1) - 1) & value) << 21); + return *this; + } + CONSTEXPR uint32_t get_axi_ext0_aw_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 22); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext0_aw_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<22) & word0) | ((((1U << 1) - 1) & value) << 22); + return *this; + } + CONSTEXPR 
uint32_t get_axi_ext0_w_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 23); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext0_w_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<23) & word0) | ((((1U << 1) - 1) & value) << 23); + return *this; + } + CONSTEXPR uint32_t get_axi_ext0_wr_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 24); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext0_wr_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<24) & word0) | ((((1U << 1) - 1) & value) << 24); + return *this; + } + CONSTEXPR uint32_t get_axi_ext1_ar_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 25); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext1_ar_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<25) & word0) | ((((1U << 1) - 1) & value) << 25); + return *this; + } + CONSTEXPR uint32_t get_axi_ext1_rd_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 26); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext1_rd_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<26) & word0) | ((((1U << 1) - 1) & value) << 26); + return *this; + } + CONSTEXPR uint32_t get_axi_ext1_aw_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 27); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext1_aw_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<27) & word0) | ((((1U << 1) - 1) & value) << 27); + return *this; + } + CONSTEXPR uint32_t get_axi_ext1_w_stalled() const + { + auto v = ((1U << 1) - 1) & (word0 >> 28); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext1_w_stalled(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<28) & word0) | ((((1U << 1) - 1) & value) << 28); + return *this; + } + CONSTEXPR uint32_t get_axi_ext1_wr_limit_stall() const + { + auto v = ((1U << 1) - 1) & (word0 >> 29); + return v; + } + CONSTEXPR dma_status1_r& set_axi_ext1_wr_limit_stall(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<29) & word0) | ((((1U << 1) - 1) & value) << 
29); + return *this; + } +#endif +}; + + +struct pmcr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t cnt_en : 1; + uint32_t event_cnt_rst : 1; + uint32_t cycle_cnt_rst : 1; + uint32_t mask_en : 1; + uint32_t reserved0 : 7; + uint32_t num_event_cnt : 5; + uint32_t reserved1 : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmcr_r() : + word0(16384) + {} + CONSTEXPR pmcr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmcr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_cnt_en() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmcr_r& set_cnt_en(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_event_cnt_rst() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmcr_r& set_event_cnt_rst(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_cycle_cnt_rst() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmcr_r& set_cycle_cnt_rst(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_mask_en() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmcr_r& set_mask_en(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_num_event_cnt() const + { + auto v = ((1U << 5) - 1) & (word0 >> 11); + return v; + } + CONSTEXPR pmcr_r& set_num_event_cnt(uint32_t value) + { + word0 = (~(((1U << 5) - 1)<<11) & word0) | ((((1U << 5) - 1) & value) << 11); + return *this; + } +#endif +}; + + +struct pmcntenset_r +{ +#ifndef __cplusplus + 
union + { + struct + { + uint32_t EVENT_CNT_0 : 1; + uint32_t EVENT_CNT_1 : 1; + uint32_t EVENT_CNT_2 : 1; + uint32_t EVENT_CNT_3 : 1; + uint32_t EVENT_CNT_4 : 1; + uint32_t EVENT_CNT_5 : 1; + uint32_t EVENT_CNT_6 : 1; + uint32_t EVENT_CNT_7 : 1; + uint32_t reserved0 : 23; + uint32_t CYCLE_CNT : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmcntenset_r() : + word0(0) + {} + CONSTEXPR pmcntenset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmcntenset_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_2(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_3(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_4() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_4(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR 
uint32_t get_EVENT_CNT_5() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_5(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_6() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_6(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_7() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR pmcntenset_r& set_EVENT_CNT_7(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmcntenset_r& set_CYCLE_CNT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmcntenclr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0 : 1; + uint32_t EVENT_CNT_1 : 1; + uint32_t EVENT_CNT_2 : 1; + uint32_t EVENT_CNT_3 : 1; + uint32_t EVENT_CNT_4 : 1; + uint32_t EVENT_CNT_5 : 1; + uint32_t EVENT_CNT_6 : 1; + uint32_t EVENT_CNT_7 : 1; + uint32_t reserved0 : 23; + uint32_t CYCLE_CNT : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmcntenclr_r() : + word0(0) + {} + CONSTEXPR pmcntenclr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmcntenclr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 
1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_1(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_2(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_3(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_4() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_4(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_5() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_5(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_6() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_6(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_7() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR pmcntenclr_r& set_EVENT_CNT_7(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT() const + { + auto v = ((1U << 1) - 1) & (word0 
>> 31); + return v; + } + CONSTEXPR pmcntenclr_r& set_CYCLE_CNT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmovsset_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0_OVF : 1; + uint32_t EVENT_CNT_1_OVF : 1; + uint32_t EVENT_CNT_2_OVF : 1; + uint32_t EVENT_CNT_3_OVF : 1; + uint32_t EVENT_CNT_4_OVF : 1; + uint32_t EVENT_CNT_5_OVF : 1; + uint32_t EVENT_CNT_6_OVF : 1; + uint32_t EVENT_CNT_7_OVF : 1; + uint32_t reserved0 : 23; + uint32_t CYCLE_CNT_OVF : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmovsset_r() : + word0(0) + {} + CONSTEXPR pmovsset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmovsset_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_0_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_1_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_2_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_3_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + 
return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_4_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_4_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_5_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_5_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_6_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_6_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_7_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR pmovsset_r& set_EVENT_CNT_7_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmovsset_r& set_CYCLE_CNT_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmovsclr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0_OVF : 1; + uint32_t EVENT_CNT_1_OVF : 1; + uint32_t EVENT_CNT_2_OVF : 1; + uint32_t EVENT_CNT_3_OVF : 1; + uint32_t EVENT_CNT_4_OVF : 1; + uint32_t EVENT_CNT_5_OVF : 1; + uint32_t EVENT_CNT_6_OVF : 1; + uint32_t EVENT_CNT_7_OVF : 1; + uint32_t reserved0 : 23; + uint32_t CYCLE_CNT_OVF : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmovsclr_r() : + word0(0) + {} + CONSTEXPR pmovsclr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) 
+ { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmovsclr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_0_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_1_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_2_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_3_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_4_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_4_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_5_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_5_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_6_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_6_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 
1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_7_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR pmovsclr_r& set_EVENT_CNT_7_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT_OVF() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmovsclr_r& set_CYCLE_CNT_OVF(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmintset_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0_INT : 1; + uint32_t EVENT_CNT_1_INT : 1; + uint32_t EVENT_CNT_2_INT : 1; + uint32_t EVENT_CNT_3_INT : 1; + uint32_t EVENT_CNT_4_INT : 1; + uint32_t EVENT_CNT_5_INT : 1; + uint32_t EVENT_CNT_6_INT : 1; + uint32_t EVENT_CNT_7_INT : 1; + uint32_t reserved0 : 23; + uint32_t CYCLE_CNT_INT : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmintset_r() : + word0(0) + {} + CONSTEXPR pmintset_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmintset_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_0_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_1_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + 
CONSTEXPR pmintset_r& set_EVENT_CNT_2_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_3_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_4_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_4_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_5_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 5); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_5_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_6_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_6_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_7_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR pmintset_r& set_EVENT_CNT_7_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmintset_r& set_CYCLE_CNT_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmintclr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EVENT_CNT_0_INT : 1; + uint32_t EVENT_CNT_1_INT : 1; + uint32_t EVENT_CNT_2_INT : 1; + uint32_t 
EVENT_CNT_3_INT : 1; + uint32_t EVENT_CNT_4_INT : 1; + uint32_t EVENT_CNT_5_INT : 1; + uint32_t EVENT_CNT_6_INT : 1; + uint32_t EVENT_CNT_7_INT : 1; + uint32_t reserved0 : 23; + uint32_t CYCLE_CNT_INT : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmintclr_r() : + word0(0) + {} + CONSTEXPR pmintclr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmintclr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_0_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_0_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_1_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 1); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_1_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<1) & word0) | ((((1U << 1) - 1) & value) << 1); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_2_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 2); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_2_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<2) & word0) | ((((1U << 1) - 1) & value) << 2); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_3_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 3); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_3_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<3) & word0) | ((((1U << 1) - 1) & value) << 3); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_4_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 4); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_4_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<4) & word0) | ((((1U << 1) - 1) & value) << 4); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_5_INT() const + { + auto v = ((1U << 1) - 1) & 
(word0 >> 5); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_5_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<5) & word0) | ((((1U << 1) - 1) & value) << 5); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_6_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 6); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_6_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<6) & word0) | ((((1U << 1) - 1) & value) << 6); + return *this; + } + CONSTEXPR uint32_t get_EVENT_CNT_7_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 7); + return v; + } + CONSTEXPR pmintclr_r& set_EVENT_CNT_7_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<7) & word0) | ((((1U << 1) - 1) & value) << 7); + return *this; + } + CONSTEXPR uint32_t get_CYCLE_CNT_INT() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + return v; + } + CONSTEXPR pmintclr_r& set_CYCLE_CNT_INT(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<31) & word0) | ((((1U << 1) - 1) & value) << 31); + return *this; + } +#endif +}; + + +struct pmccntr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CYCLE_CNT_LO : 32; + uint32_t CYCLE_CNT_HI : 16; + uint32_t reserved0 : 16; + }; + uint32_t word[2]; + }; +#else +private: + uint32_t word0; + uint32_t word1; +public: + CONSTEXPR pmccntr_r() : + word0(0), + word1(0) + {} + CONSTEXPR pmccntr_r(uint64_t init) : + word0(static_cast((init) & static_cast(std::numeric_limits::max()))), + word1(static_cast((init >> 32) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR void operator=(uint64_t value) + { + word0 = static_cast((value) & static_cast(std::numeric_limits::max())); word1 = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); + } + CONSTEXPR operator uint64_t() + { + return (static_cast(word1) << 32) | word0; + } + pmccntr_r copy() + { + return *this; + } +#endif +}; + + +struct pmccntr_cfg_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CYCLE_CNT_CFG_START : 10; + uint32_t reserved0 : 
2; + uint32_t S0 : 1; + uint32_t S1 : 1; + uint32_t S2 : 1; + uint32_t S3 : 1; + uint32_t CYCLE_CNT_CFG_STOP : 10; + uint32_t reserved1 : 2; + uint32_t E0 : 1; + uint32_t E1 : 1; + uint32_t E2 : 1; + uint32_t E3 : 1; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmccntr_cfg_r() : + word0(0) + {} + CONSTEXPR pmccntr_cfg_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmccntr_cfg_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_event get_CYCLE_CNT_CFG_START() const + { + auto v = ((1U << 10) - 1) & (word0 >> 0); + assert(v <= 399); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_CYCLE_CNT_CFG_START(NPU_NAMESPACE::pmu_event value) + { + word0 = (~(((1U << 10) - 1)<<0) & word0) | ((((1U << 10) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_S0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 12); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_S0(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<12) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 12); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_S1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 13); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_S1(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<13) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 13); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_S2() const + { + auto v = ((1U << 1) - 1) & (word0 >> 14); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_S2(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<14) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 14); + return *this; + } + CONSTEXPR 
NPU_NAMESPACE::pmu_port_disable get_S3() const + { + auto v = ((1U << 1) - 1) & (word0 >> 15); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_S3(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<15) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 15); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_event get_CYCLE_CNT_CFG_STOP() const + { + auto v = ((1U << 10) - 1) & (word0 >> 16); + assert(v <= 399); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_CYCLE_CNT_CFG_STOP(NPU_NAMESPACE::pmu_event value) + { + word0 = (~(((1U << 10) - 1)<<16) & word0) | ((((1U << 10) - 1) & static_cast(value)) << 16); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_E0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 28); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_E0(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<28) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 28); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_E1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 29); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_E1(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<29) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 29); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_E2() const + { + auto v = ((1U << 1) - 1) & (word0 >> 30); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_E2(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<30) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 30); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_E3() const + { + auto v = ((1U << 1) - 1) & (word0 >> 31); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmccntr_cfg_r& set_E3(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<31) & 
word0) | ((((1U << 1) - 1) & static_cast(value)) << 31); + return *this; + } +#endif +}; + + +struct pmcaxi_chan_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t CH_SEL : 4; + uint32_t reserved0 : 4; + uint32_t AXI_SEL : 1; + uint32_t reserved1 : 1; + uint32_t BW_CH_SEL_EN : 1; + uint32_t reserved2 : 21; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmcaxi_chan_r() : + word0(0) + {} + CONSTEXPR pmcaxi_chan_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmcaxi_chan_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_axi_channel get_CH_SEL() const + { + auto v = ((1U << 4) - 1) & (word0 >> 0); + assert(v <= 9); + return static_cast(v); + } + CONSTEXPR pmcaxi_chan_r& set_CH_SEL(NPU_NAMESPACE::pmu_axi_channel value) + { + word0 = (~(((1U << 4) - 1)<<0) & word0) | ((((1U << 4) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::axi_port get_AXI_SEL() const + { + auto v = ((1U << 1) - 1) & (word0 >> 8); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmcaxi_chan_r& set_AXI_SEL(NPU_NAMESPACE::axi_port value) + { + word0 = (~(((1U << 1) - 1)<<8) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 8); + return *this; + } + CONSTEXPR uint32_t get_BW_CH_SEL_EN() const + { + auto v = ((1U << 1) - 1) & (word0 >> 10); + return v; + } + CONSTEXPR pmcaxi_chan_r& set_BW_CH_SEL_EN(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<10) & word0) | ((((1U << 1) - 1) & value) << 10); + return *this; + } +#endif +}; + + +struct pmclut_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t PCM_LUT_EN_0 : 1; + uint32_t reserved0 : 15; + uint32_t PMC_LUT_0 : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmclut_r() : + word0(0) + {} + CONSTEXPR pmclut_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t 
value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmclut_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_PCM_LUT_EN_0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 0); + return v; + } + CONSTEXPR pmclut_r& set_PCM_LUT_EN_0(uint32_t value) + { + word0 = (~(((1U << 1) - 1)<<0) & word0) | ((((1U << 1) - 1) & value) << 0); + return *this; + } + CONSTEXPR uint32_t get_PMC_LUT_0() const + { + auto v = ((1U << 16) - 1) & (word0 >> 16); + return v; + } + CONSTEXPR pmclut_r& set_PMC_LUT_0(uint32_t value) + { + word0 = (~(((1U << 16) - 1)<<16) & word0) | ((((1U << 16) - 1) & value) << 16); + return *this; + } +#endif +}; + + +struct pmevcntr_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t count : 32; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmevcntr_r() : + word0(0) + {} + CONSTEXPR pmevcntr_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmevcntr_r copy() + { + return *this; + } + CONSTEXPR uint32_t get_count() const + { + auto v = word0; + return v; + } + CONSTEXPR pmevcntr_r& set_count(uint32_t value) + { + word0 = value; + return *this; + } +#endif +}; + + +struct pmevtyper_r +{ +#ifndef __cplusplus + union + { + struct + { + uint32_t EV_TYPE : 10; + uint32_t reserved0 : 2; + uint32_t D0 : 1; + uint32_t D1 : 1; + uint32_t D2 : 1; + uint32_t D3 : 1; + uint32_t reserved1 : 16; + }; + uint32_t word; + }; +#else +private: + uint32_t word0; +public: + CONSTEXPR pmevtyper_r() : + word0(0) + {} + CONSTEXPR pmevtyper_r(uint32_t init) : + word0(init) + {} + CONSTEXPR void operator=(uint32_t value) + { + word0 = value; + } + CONSTEXPR operator uint32_t() + { + return word0; + } + pmevtyper_r copy() + { + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_event get_EV_TYPE() const + { + auto v = ((1U << 10) - 1) & (word0 >> 0); + assert(v <= 399); + return 
static_cast(v); + } + CONSTEXPR pmevtyper_r& set_EV_TYPE(NPU_NAMESPACE::pmu_event value) + { + word0 = (~(((1U << 10) - 1)<<0) & word0) | ((((1U << 10) - 1) & static_cast(value)) << 0); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_D0() const + { + auto v = ((1U << 1) - 1) & (word0 >> 12); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmevtyper_r& set_D0(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<12) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 12); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_D1() const + { + auto v = ((1U << 1) - 1) & (word0 >> 13); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmevtyper_r& set_D1(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<13) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 13); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_D2() const + { + auto v = ((1U << 1) - 1) & (word0 >> 14); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmevtyper_r& set_D2(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<14) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 14); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pmu_port_disable get_D3() const + { + auto v = ((1U << 1) - 1) & (word0 >> 15); + assert(v <= 1); + return static_cast(v); + } + CONSTEXPR pmevtyper_r& set_D3(NPU_NAMESPACE::pmu_port_disable value) + { + word0 = (~(((1U << 1) - 1)<<15) & word0) | ((((1U << 1) - 1) & static_cast(value)) << 15); + return *this; + } +#endif +}; + +struct NPU_REG +{ + STRUCT id_r ID; + STRUCT status_r STATUS; + STRUCT cmd_r CMD; + STRUCT reset_r RESET; + STRUCT qbase_r QBASE; + STRUCT qread_r QREAD; + STRUCT qconfig_r QCONFIG; + STRUCT qsize_r QSIZE; + STRUCT prot_r PROT; + STRUCT config_r CONFIG; + uint32_t unused0[1]; + STRUCT cond_status_r COND_STATUS; + uint32_t unused1[1]; + STRUCT power_ctrl_r POWER_CTRL; + STRUCT regioncfg_r REGIONCFG; + STRUCT 
mem_attr_r MEM_ATTR[4]; + STRUCT axi_sram_r AXI_SRAM; + STRUCT axi_ext_r AXI_EXT; + uint32_t unused2[2]; + STRUCT cfg_sram_cap_r CFG_SRAM_CAP; + STRUCT cfg_ext_cap_r CFG_EXT_CAP; + STRUCT cfg_sram_hash0_r CFG_SRAM_HASH0; + STRUCT cfg_sram_hash1_r CFG_SRAM_HASH1; + STRUCT cfg_ext_hash0_r CFG_EXT_HASH0; + STRUCT basep_r BASEP[8]; + uint32_t unused3[32]; + STRUCT clkforce_r CLKFORCE; + STRUCT debug_address_r DEBUG_ADDRESS; + STRUCT debug_misc_r DEBUG_MISC; + uint32_t unused4[61]; + STRUCT dma_ifm_src_r DMA_IFM_SRC; + STRUCT dma_ifm_dst_r DMA_IFM_DST; + STRUCT dma_ofm_src_r DMA_OFM_SRC; + STRUCT dma_ofm_dst_r DMA_OFM_DST; + STRUCT dma_weight_src_r DMA_WEIGHT_SRC; + STRUCT dma_cmd_src_r DMA_CMD_SRC; + STRUCT dma_cmd_size_r DMA_CMD_SIZE; + STRUCT dma_m2m_src_r DMA_M2M_SRC; + STRUCT dma_m2m_dst_r DMA_M2M_DST; + STRUCT current_qread_r CURRENT_QREAD; + STRUCT dma_scale_src_r DMA_SCALE_SRC; + STRUCT dma_weight1_src_r DMA_WEIGHT1_SRC; + STRUCT dma_weight2_src_r DMA_WEIGHT2_SRC; + STRUCT dma_weight3_src_r DMA_WEIGHT3_SRC; + uint32_t unused5[6]; + STRUCT current_op_r CURRENT_OP; + STRUCT current_cmd_r CURRENT_CMD; + uint32_t unused6[80]; + STRUCT internal_memory_r INTERNAL_MEMORY[256]; + STRUCT ifm_pad_top_r IFM_PAD_TOP; + STRUCT ifm_pad_left_r IFM_PAD_LEFT; + STRUCT ifm_pad_right_r IFM_PAD_RIGHT; + STRUCT ifm_pad_bottom_r IFM_PAD_BOTTOM; + STRUCT ifm_depth_m1_r IFM_DEPTH_M1; + STRUCT ifm_precision_r IFM_PRECISION; + uint32_t unused7[1]; + STRUCT ifm_upscale_r IFM_UPSCALE; + STRUCT ifm_broadcast_r IFM_BROADCAST; + STRUCT ifm_zero_point_r IFM_ZERO_POINT; + STRUCT ifm_width0_m1_r IFM_WIDTH0_M1; + STRUCT ifm_height0_m1_r IFM_HEIGHT0_M1; + STRUCT ifm_height1_m1_r IFM_HEIGHT1_M1; + uint32_t unused8[2]; + STRUCT ifm_region_r IFM_REGION; + uint32_t unused9[1]; + STRUCT ofm_width_m1_r OFM_WIDTH_M1; + STRUCT ofm_height_m1_r OFM_HEIGHT_M1; + STRUCT ofm_depth_m1_r OFM_DEPTH_M1; + STRUCT ofm_precision_r OFM_PRECISION; + STRUCT ofm_blk_width_m1_r OFM_BLK_WIDTH_M1; + STRUCT 
ofm_blk_height_m1_r OFM_BLK_HEIGHT_M1; + STRUCT ofm_blk_depth_m1_r OFM_BLK_DEPTH_M1; + STRUCT ofm_zero_point_r OFM_ZERO_POINT; + uint32_t unused10[1]; + STRUCT ofm_width0_m1_r OFM_WIDTH0_M1; + STRUCT ofm_height0_m1_r OFM_HEIGHT0_M1; + STRUCT ofm_height1_m1_r OFM_HEIGHT1_M1; + uint32_t unused11[2]; + STRUCT ofm_region_r OFM_REGION; + STRUCT kernel_width_m1_r KERNEL_WIDTH_M1; + STRUCT kernel_height_m1_r KERNEL_HEIGHT_M1; + STRUCT kernel_stride_r KERNEL_STRIDE; + uint32_t unused12[1]; + STRUCT acc_format_r ACC_FORMAT; + STRUCT activation_r ACTIVATION; + STRUCT activation_min_r ACTIVATION_MIN; + STRUCT activation_max_r ACTIVATION_MAX; + STRUCT weight_region_r WEIGHT_REGION; + STRUCT scale_region_r SCALE_REGION; + STRUCT resize_x_scale_n_m1_r RESIZE_X_SCALE_N_M1; + STRUCT resize_y_scale_n_m1_r RESIZE_Y_SCALE_N_M1; + STRUCT resize_x_offset_r RESIZE_X_OFFSET; + STRUCT resize_y_offset_r RESIZE_Y_OFFSET; + STRUCT weight_format_r WEIGHT_FORMAT; + STRUCT blockdep_r BLOCKDEP; + STRUCT dma0_src_region_r DMA0_SRC_REGION; + STRUCT dma0_dst_region_r DMA0_DST_REGION; + STRUCT dma0_size0_r DMA0_SIZE0; + STRUCT dma0_size1_r DMA0_SIZE1; + STRUCT dma0_idx_region_r DMA0_IDX_REGION; + uint32_t unused13[11]; + STRUCT ifm2_broadcast_r IFM2_BROADCAST; + uint32_t unused14[4]; + STRUCT ifm2_precision_r IFM2_PRECISION; + uint32_t unused15[3]; + STRUCT ifm2_zero_point_r IFM2_ZERO_POINT; + STRUCT ifm2_width0_m1_r IFM2_WIDTH0_M1; + STRUCT ifm2_height0_m1_r IFM2_HEIGHT0_M1; + STRUCT ifm2_height1_m1_r IFM2_HEIGHT1_M1; + uint32_t unused16[2]; + STRUCT ifm2_region_r IFM2_REGION; + uint32_t unused17[48]; + STRUCT ifm_base0_r IFM_BASE0; + STRUCT ifm_base1_r IFM_BASE1; + STRUCT ifm_base2_r IFM_BASE2; + STRUCT ifm_base3_r IFM_BASE3; + STRUCT ifm_stride_x_r IFM_STRIDE_X; + STRUCT ifm_stride_y_r IFM_STRIDE_Y; + STRUCT ifm_stride_c_r IFM_STRIDE_C; + uint32_t unused18[2]; + STRUCT ofm_base0_r OFM_BASE0; + STRUCT ofm_base1_r OFM_BASE1; + STRUCT ofm_base2_r OFM_BASE2; + STRUCT ofm_base3_r OFM_BASE3; + STRUCT 
ofm_stride_x_r OFM_STRIDE_X; + STRUCT ofm_stride_y_r OFM_STRIDE_Y; + STRUCT ofm_stride_c_r OFM_STRIDE_C; + uint32_t unused19[2]; + STRUCT weight_base_r WEIGHT_BASE; + STRUCT weight_length_r WEIGHT_LENGTH; + STRUCT scale_base_r SCALE_BASE; + STRUCT scale_length_r SCALE_LENGTH; + STRUCT ofm_scale_r OFM_SCALE; + STRUCT ifm_scale_r IFM_SCALE; + STRUCT ifm2_scale_r IFM2_SCALE; + STRUCT op_scalar_r OP_SCALAR; + STRUCT dma0_src_r DMA0_SRC; + STRUCT dma0_dst_r DMA0_DST; + STRUCT dma0_len_r DMA0_LEN; + STRUCT dma0_src_stride0_r DMA0_SRC_STRIDE0; + STRUCT dma0_src_stride1_r DMA0_SRC_STRIDE1; + STRUCT dma0_dst_stride0_r DMA0_DST_STRIDE0; + STRUCT dma0_dst_stride1_r DMA0_DST_STRIDE1; + STRUCT dma0_idx_r DMA0_IDX; + STRUCT ifm2_base0_r IFM2_BASE0; + STRUCT ifm2_base1_r IFM2_BASE1; + STRUCT ifm2_base2_r IFM2_BASE2; + STRUCT ifm2_base3_r IFM2_BASE3; + STRUCT ifm2_stride_x_r IFM2_STRIDE_X; + STRUCT ifm2_stride_y_r IFM2_STRIDE_Y; + STRUCT ifm2_stride_c_r IFM2_STRIDE_C; + uint32_t unused20[2]; + STRUCT weight1_base_r WEIGHT1_BASE; + STRUCT weight1_length_r WEIGHT1_LENGTH; + STRUCT weight2_base_r WEIGHT2_BASE; + STRUCT weight2_length_r WEIGHT2_LENGTH; + STRUCT weight3_base_r WEIGHT3_BASE; + STRUCT weight3_length_r WEIGHT3_LENGTH; + STRUCT resize_x_step_r RESIZE_X_STEP; + STRUCT resize_y_step_r RESIZE_Y_STEP; + uint32_t unused21[16]; + STRUCT dma0_idx_max_r DMA0_IDX_MAX; + STRUCT dma0_idx_skip1_r DMA0_IDX_SKIP1; + uint32_t unused22[252]; + STRUCT revision_r REVISION; + uint32_t unused23[3]; + STRUCT pid4_r PID4; + STRUCT pid5_r PID5; + STRUCT pid6_r PID6; + STRUCT pid7_r PID7; + STRUCT pid0_r PID0; + STRUCT pid1_r PID1; + STRUCT pid2_r PID2; + STRUCT pid3_r PID3; + STRUCT cid0_r CID0; + STRUCT cid1_r CID1; + STRUCT cid2_r CID2; + STRUCT cid3_r CID3; + uint32_t unused24[64]; + STRUCT wd_status_r WD_STATUS; + STRUCT mac_status_r MAC_STATUS; + STRUCT ao_status_r AO_STATUS; + uint32_t unused25[1]; + STRUCT dma_status0_r DMA_STATUS0; + STRUCT dma_status1_r DMA_STATUS1; + uint32_t 
unused26[26]; + STRUCT pmcr_r PMCR; + STRUCT pmcntenset_r PMCNTENSET; + STRUCT pmcntenclr_r PMCNTENCLR; + STRUCT pmovsset_r PMOVSSET; + STRUCT pmovsclr_r PMOVSCLR; + STRUCT pmintset_r PMINTSET; + STRUCT pmintclr_r PMINTCLR; + uint32_t unused27[1]; + STRUCT pmccntr_r PMCCNTR; + STRUCT pmccntr_cfg_r PMCCNTR_CFG; + STRUCT pmcaxi_chan_r PMCAXI_CHAN; + STRUCT pmclut_r PMCLUT; + uint32_t unused28[83]; + STRUCT pmevcntr_r PMEVCNTR[8]; + uint32_t unused29[24]; + STRUCT pmevtyper_r PMEVTYPER[8]; + +#ifdef __cplusplus + enum class access_type_t : uint8_t { RW, RO, WO }; + NPU_REG() + { + reset(); + } + void reset() + { + ID = 536899584; + STATUS = 8; + CMD = 12; + RESET = 0; + QBASE = 0; + QREAD = 0; + QCONFIG = 0; + QSIZE = 0; + PROT = 0; + CONFIG = 536870928; + COND_STATUS = 0; + POWER_CTRL = 0; + REGIONCFG = 0; + for (size_t i = 0; i < (sizeof(MEM_ATTR) / sizeof(MEM_ATTR[0])); ++i) + MEM_ATTR[i] = 0; + AXI_SRAM = 0; + AXI_EXT = 0; + CFG_SRAM_CAP = 0; + CFG_EXT_CAP = 0; + CFG_SRAM_HASH0 = 0; + CFG_SRAM_HASH1 = 0; + CFG_EXT_HASH0 = 0; + for (size_t i = 0; i < (sizeof(BASEP) / sizeof(BASEP[0])); ++i) + BASEP[i] = 0; + CLKFORCE = 0; + DEBUG_ADDRESS = 0; + DEBUG_MISC = 0; + DMA_IFM_SRC = 0; + DMA_IFM_DST = 0; + DMA_OFM_SRC = 0; + DMA_OFM_DST = 0; + DMA_WEIGHT_SRC = 0; + DMA_CMD_SRC = 0; + DMA_CMD_SIZE = 0; + DMA_M2M_SRC = 0; + DMA_M2M_DST = 0; + CURRENT_QREAD = 0; + DMA_SCALE_SRC = 0; + DMA_WEIGHT1_SRC = 0; + DMA_WEIGHT2_SRC = 0; + DMA_WEIGHT3_SRC = 0; + CURRENT_OP = 0; + CURRENT_CMD = 0; + for (size_t i = 0; i < (sizeof(INTERNAL_MEMORY) / sizeof(INTERNAL_MEMORY[0])); ++i) + INTERNAL_MEMORY[i] = 0; + IFM_PAD_TOP = 0; + IFM_PAD_LEFT = 0; + IFM_PAD_RIGHT = 0; + IFM_PAD_BOTTOM = 0; + IFM_DEPTH_M1 = 0; + IFM_PRECISION = 0; + IFM_UPSCALE = 0; + IFM_BROADCAST = 0; + IFM_ZERO_POINT = 0; + IFM_WIDTH0_M1 = 0; + IFM_HEIGHT0_M1 = 0; + IFM_HEIGHT1_M1 = 0; + IFM_REGION = 0; + OFM_WIDTH_M1 = 0; + OFM_HEIGHT_M1 = 0; + OFM_DEPTH_M1 = 0; + OFM_PRECISION = 0; + OFM_BLK_WIDTH_M1 = 0; + 
OFM_BLK_HEIGHT_M1 = 0; + OFM_BLK_DEPTH_M1 = 0; + OFM_ZERO_POINT = 0; + OFM_WIDTH0_M1 = 0; + OFM_HEIGHT0_M1 = 0; + OFM_HEIGHT1_M1 = 0; + OFM_REGION = 0; + KERNEL_WIDTH_M1 = 0; + KERNEL_HEIGHT_M1 = 0; + KERNEL_STRIDE = 0; + ACC_FORMAT = 0; + ACTIVATION = 0; + ACTIVATION_MIN = 0; + ACTIVATION_MAX = 0; + WEIGHT_REGION = 0; + SCALE_REGION = 0; + RESIZE_X_SCALE_N_M1 = 0; + RESIZE_Y_SCALE_N_M1 = 0; + RESIZE_X_OFFSET = 0; + RESIZE_Y_OFFSET = 0; + WEIGHT_FORMAT = 0; + BLOCKDEP = 0; + DMA0_SRC_REGION = 0; + DMA0_DST_REGION = 0; + DMA0_SIZE0 = 0; + DMA0_SIZE1 = 0; + DMA0_IDX_REGION = 0; + IFM2_BROADCAST = 0; + IFM2_PRECISION = 0; + IFM2_ZERO_POINT = 0; + IFM2_WIDTH0_M1 = 0; + IFM2_HEIGHT0_M1 = 0; + IFM2_HEIGHT1_M1 = 0; + IFM2_REGION = 0; + IFM_BASE0 = 0; + IFM_BASE1 = 0; + IFM_BASE2 = 0; + IFM_BASE3 = 0; + IFM_STRIDE_X = 0; + IFM_STRIDE_Y = 0; + IFM_STRIDE_C = 0; + OFM_BASE0 = 0; + OFM_BASE1 = 0; + OFM_BASE2 = 0; + OFM_BASE3 = 0; + OFM_STRIDE_X = 0; + OFM_STRIDE_Y = 0; + OFM_STRIDE_C = 0; + WEIGHT_BASE = 0; + WEIGHT_LENGTH = 0; + SCALE_BASE = 0; + SCALE_LENGTH = 0; + OFM_SCALE = 0; + IFM_SCALE = 0; + IFM2_SCALE = 0; + OP_SCALAR = 0; + DMA0_SRC = 0; + DMA0_DST = 0; + DMA0_LEN = 0; + DMA0_SRC_STRIDE0 = 0; + DMA0_SRC_STRIDE1 = 0; + DMA0_DST_STRIDE0 = 0; + DMA0_DST_STRIDE1 = 0; + DMA0_IDX = 0; + IFM2_BASE0 = 0; + IFM2_BASE1 = 0; + IFM2_BASE2 = 0; + IFM2_BASE3 = 0; + IFM2_STRIDE_X = 0; + IFM2_STRIDE_Y = 0; + IFM2_STRIDE_C = 0; + WEIGHT1_BASE = 0; + WEIGHT1_LENGTH = 0; + WEIGHT2_BASE = 0; + WEIGHT2_LENGTH = 0; + WEIGHT3_BASE = 0; + WEIGHT3_LENGTH = 0; + RESIZE_X_STEP = 0; + RESIZE_Y_STEP = 0; + DMA0_IDX_MAX = 0; + DMA0_IDX_SKIP1 = 0; + REVISION = 0; + PID4 = 4; + PID5 = 0; + PID6 = 0; + PID7 = 0; + PID0 = 130; + PID1 = 181; + PID2 = 11; + PID3 = 0; + CID0 = 13; + CID1 = 240; + CID2 = 5; + CID3 = 177; + WD_STATUS = 0; + MAC_STATUS = 0; + AO_STATUS = 0; + DMA_STATUS0 = 0; + DMA_STATUS1 = 0; + PMCR = 16384; + PMCNTENSET = 0; + PMCNTENCLR = 0; + PMOVSSET = 0; + PMOVSCLR = 0; + PMINTSET 
= 0; + PMINTCLR = 0; + PMCCNTR = 0; + PMCCNTR_CFG = 0; + PMCAXI_CHAN = 0; + PMCLUT = 0; + for (size_t i = 0; i < (sizeof(PMEVCNTR) / sizeof(PMEVCNTR[0])); ++i) + PMEVCNTR[i] = 0; + for (size_t i = 0; i < (sizeof(PMEVTYPER) / sizeof(PMEVTYPER[0])); ++i) + PMEVTYPER[i] = 0; + } + uint32_t& operator[](const int addr_offset) + { + return reinterpret_cast(this)[addr_offset / 4]; + } + access_type_t get_access_type(uint32_t offset) + { + switch (offset) + { + case 0: return access_type_t::RO; + case 4: return access_type_t::RO; + case 8: return access_type_t::RW; + case 12: return access_type_t::RW; + case 16: return access_type_t::RW; + case 24: return access_type_t::RO; + case 28: return access_type_t::RW; + case 32: return access_type_t::RW; + case 36: return access_type_t::RO; + case 40: return access_type_t::RO; + case 48: return access_type_t::RW; + case 56: return access_type_t::RW; + case 60: return access_type_t::RW; + case 64: return access_type_t::RW; + case 68: return access_type_t::RW; + case 72: return access_type_t::RW; + case 76: return access_type_t::RW; + case 80: return access_type_t::RW; + case 84: return access_type_t::RW; + case 96: return access_type_t::RO; + case 100: return access_type_t::RO; + case 104: return access_type_t::RO; + case 112: return access_type_t::RO; + case 120: return access_type_t::RO; + case 128: return access_type_t::RW; + case 136: return access_type_t::RW; + case 144: return access_type_t::RW; + case 152: return access_type_t::RW; + case 160: return access_type_t::RW; + case 168: return access_type_t::RW; + case 176: return access_type_t::RW; + case 184: return access_type_t::RW; + case 320: return access_type_t::RW; + case 324: return access_type_t::RW; + case 328: return access_type_t::RW; + case 576: return access_type_t::RO; + case 584: return access_type_t::RO; + case 588: return access_type_t::RO; + case 592: return access_type_t::RO; + case 600: return access_type_t::RO; + case 608: return access_type_t::RO; + case 
616: return access_type_t::RO; + case 620: return access_type_t::RO; + case 628: return access_type_t::RO; + case 636: return access_type_t::RO; + case 640: return access_type_t::RO; + case 648: return access_type_t::RO; + case 656: return access_type_t::RO; + case 664: return access_type_t::RO; + case 696: return access_type_t::RO; + case 700: return access_type_t::RO; + case 1024: return access_type_t::RW; + case 1028: return access_type_t::RW; + case 1032: return access_type_t::RW; + case 1036: return access_type_t::RW; + case 1040: return access_type_t::RW; + case 1044: return access_type_t::RW; + case 1048: return access_type_t::RW; + case 1052: return access_type_t::RW; + case 1056: return access_type_t::RW; + case 1060: return access_type_t::RW; + case 1064: return access_type_t::RW; + case 1068: return access_type_t::RW; + case 1072: return access_type_t::RW; + case 1076: return access_type_t::RW; + case 1080: return access_type_t::RW; + case 1084: return access_type_t::RW; + case 1088: return access_type_t::RW; + case 1092: return access_type_t::RW; + case 1096: return access_type_t::RW; + case 1100: return access_type_t::RW; + case 1104: return access_type_t::RW; + case 1108: return access_type_t::RW; + case 1112: return access_type_t::RW; + case 1116: return access_type_t::RW; + case 1120: return access_type_t::RW; + case 1124: return access_type_t::RW; + case 1128: return access_type_t::RW; + case 1132: return access_type_t::RW; + case 1136: return access_type_t::RW; + case 1140: return access_type_t::RW; + case 1144: return access_type_t::RW; + case 1148: return access_type_t::RW; + case 1152: return access_type_t::RW; + case 1156: return access_type_t::RW; + case 1160: return access_type_t::RW; + case 1164: return access_type_t::RW; + case 1168: return access_type_t::RW; + case 1172: return access_type_t::RW; + case 1176: return access_type_t::RW; + case 1180: return access_type_t::RW; + case 1184: return access_type_t::RW; + case 1188: return 
access_type_t::RW; + case 1192: return access_type_t::RW; + case 1196: return access_type_t::RW; + case 1200: return access_type_t::RW; + case 1204: return access_type_t::RW; + case 1208: return access_type_t::RW; + case 1212: return access_type_t::RW; + case 1216: return access_type_t::RW; + case 1220: return access_type_t::RW; + case 1224: return access_type_t::RW; + case 1228: return access_type_t::RW; + case 1232: return access_type_t::RW; + case 1236: return access_type_t::RW; + case 1240: return access_type_t::RW; + case 1244: return access_type_t::RW; + case 1248: return access_type_t::RW; + case 1252: return access_type_t::RW; + case 1256: return access_type_t::RW; + case 1260: return access_type_t::RW; + case 1264: return access_type_t::RW; + case 1268: return access_type_t::RW; + case 1272: return access_type_t::RW; + case 1276: return access_type_t::RW; + case 1280: return access_type_t::RW; + case 1284: return access_type_t::RW; + case 1288: return access_type_t::RW; + case 1292: return access_type_t::RW; + case 1296: return access_type_t::RW; + case 1300: return access_type_t::RW; + case 1304: return access_type_t::RW; + case 1308: return access_type_t::RW; + case 1312: return access_type_t::RW; + case 1316: return access_type_t::RW; + case 1320: return access_type_t::RW; + case 1324: return access_type_t::RW; + case 1328: return access_type_t::RW; + case 1332: return access_type_t::RW; + case 1336: return access_type_t::RW; + case 1340: return access_type_t::RW; + case 1344: return access_type_t::RW; + case 1348: return access_type_t::RW; + case 1352: return access_type_t::RW; + case 1356: return access_type_t::RW; + case 1360: return access_type_t::RW; + case 1364: return access_type_t::RW; + case 1368: return access_type_t::RW; + case 1372: return access_type_t::RW; + case 1376: return access_type_t::RW; + case 1380: return access_type_t::RW; + case 1384: return access_type_t::RW; + case 1388: return access_type_t::RW; + case 1392: return 
access_type_t::RW; + case 1396: return access_type_t::RW; + case 1400: return access_type_t::RW; + case 1404: return access_type_t::RW; + case 1408: return access_type_t::RW; + case 1412: return access_type_t::RW; + case 1416: return access_type_t::RW; + case 1420: return access_type_t::RW; + case 1424: return access_type_t::RW; + case 1428: return access_type_t::RW; + case 1432: return access_type_t::RW; + case 1436: return access_type_t::RW; + case 1440: return access_type_t::RW; + case 1444: return access_type_t::RW; + case 1448: return access_type_t::RW; + case 1452: return access_type_t::RW; + case 1456: return access_type_t::RW; + case 1460: return access_type_t::RW; + case 1464: return access_type_t::RW; + case 1468: return access_type_t::RW; + case 1472: return access_type_t::RW; + case 1476: return access_type_t::RW; + case 1480: return access_type_t::RW; + case 1484: return access_type_t::RW; + case 1488: return access_type_t::RW; + case 1492: return access_type_t::RW; + case 1496: return access_type_t::RW; + case 1500: return access_type_t::RW; + case 1504: return access_type_t::RW; + case 1508: return access_type_t::RW; + case 1512: return access_type_t::RW; + case 1516: return access_type_t::RW; + case 1520: return access_type_t::RW; + case 1524: return access_type_t::RW; + case 1528: return access_type_t::RW; + case 1532: return access_type_t::RW; + case 1536: return access_type_t::RW; + case 1540: return access_type_t::RW; + case 1544: return access_type_t::RW; + case 1548: return access_type_t::RW; + case 1552: return access_type_t::RW; + case 1556: return access_type_t::RW; + case 1560: return access_type_t::RW; + case 1564: return access_type_t::RW; + case 1568: return access_type_t::RW; + case 1572: return access_type_t::RW; + case 1576: return access_type_t::RW; + case 1580: return access_type_t::RW; + case 1584: return access_type_t::RW; + case 1588: return access_type_t::RW; + case 1592: return access_type_t::RW; + case 1596: return 
access_type_t::RW; + case 1600: return access_type_t::RW; + case 1604: return access_type_t::RW; + case 1608: return access_type_t::RW; + case 1612: return access_type_t::RW; + case 1616: return access_type_t::RW; + case 1620: return access_type_t::RW; + case 1624: return access_type_t::RW; + case 1628: return access_type_t::RW; + case 1632: return access_type_t::RW; + case 1636: return access_type_t::RW; + case 1640: return access_type_t::RW; + case 1644: return access_type_t::RW; + case 1648: return access_type_t::RW; + case 1652: return access_type_t::RW; + case 1656: return access_type_t::RW; + case 1660: return access_type_t::RW; + case 1664: return access_type_t::RW; + case 1668: return access_type_t::RW; + case 1672: return access_type_t::RW; + case 1676: return access_type_t::RW; + case 1680: return access_type_t::RW; + case 1684: return access_type_t::RW; + case 1688: return access_type_t::RW; + case 1692: return access_type_t::RW; + case 1696: return access_type_t::RW; + case 1700: return access_type_t::RW; + case 1704: return access_type_t::RW; + case 1708: return access_type_t::RW; + case 1712: return access_type_t::RW; + case 1716: return access_type_t::RW; + case 1720: return access_type_t::RW; + case 1724: return access_type_t::RW; + case 1728: return access_type_t::RW; + case 1732: return access_type_t::RW; + case 1736: return access_type_t::RW; + case 1740: return access_type_t::RW; + case 1744: return access_type_t::RW; + case 1748: return access_type_t::RW; + case 1752: return access_type_t::RW; + case 1756: return access_type_t::RW; + case 1760: return access_type_t::RW; + case 1764: return access_type_t::RW; + case 1768: return access_type_t::RW; + case 1772: return access_type_t::RW; + case 1776: return access_type_t::RW; + case 1780: return access_type_t::RW; + case 1784: return access_type_t::RW; + case 1788: return access_type_t::RW; + case 1792: return access_type_t::RW; + case 1796: return access_type_t::RW; + case 1800: return 
access_type_t::RW; + case 1804: return access_type_t::RW; + case 1808: return access_type_t::RW; + case 1812: return access_type_t::RW; + case 1816: return access_type_t::RW; + case 1820: return access_type_t::RW; + case 1824: return access_type_t::RW; + case 1828: return access_type_t::RW; + case 1832: return access_type_t::RW; + case 1836: return access_type_t::RW; + case 1840: return access_type_t::RW; + case 1844: return access_type_t::RW; + case 1848: return access_type_t::RW; + case 1852: return access_type_t::RW; + case 1856: return access_type_t::RW; + case 1860: return access_type_t::RW; + case 1864: return access_type_t::RW; + case 1868: return access_type_t::RW; + case 1872: return access_type_t::RW; + case 1876: return access_type_t::RW; + case 1880: return access_type_t::RW; + case 1884: return access_type_t::RW; + case 1888: return access_type_t::RW; + case 1892: return access_type_t::RW; + case 1896: return access_type_t::RW; + case 1900: return access_type_t::RW; + case 1904: return access_type_t::RW; + case 1908: return access_type_t::RW; + case 1912: return access_type_t::RW; + case 1916: return access_type_t::RW; + case 1920: return access_type_t::RW; + case 1924: return access_type_t::RW; + case 1928: return access_type_t::RW; + case 1932: return access_type_t::RW; + case 1936: return access_type_t::RW; + case 1940: return access_type_t::RW; + case 1944: return access_type_t::RW; + case 1948: return access_type_t::RW; + case 1952: return access_type_t::RW; + case 1956: return access_type_t::RW; + case 1960: return access_type_t::RW; + case 1964: return access_type_t::RW; + case 1968: return access_type_t::RW; + case 1972: return access_type_t::RW; + case 1976: return access_type_t::RW; + case 1980: return access_type_t::RW; + case 1984: return access_type_t::RW; + case 1988: return access_type_t::RW; + case 1992: return access_type_t::RW; + case 1996: return access_type_t::RW; + case 2000: return access_type_t::RW; + case 2004: return 
access_type_t::RW; + case 2008: return access_type_t::RW; + case 2012: return access_type_t::RW; + case 2016: return access_type_t::RW; + case 2020: return access_type_t::RW; + case 2024: return access_type_t::RW; + case 2028: return access_type_t::RW; + case 2032: return access_type_t::RW; + case 2036: return access_type_t::RW; + case 2040: return access_type_t::RW; + case 2044: return access_type_t::RW; + case 2048: return access_type_t::RW; + case 2052: return access_type_t::RW; + case 2056: return access_type_t::RW; + case 2060: return access_type_t::RW; + case 2064: return access_type_t::RW; + case 2068: return access_type_t::RW; + case 2076: return access_type_t::RW; + case 2080: return access_type_t::RW; + case 2084: return access_type_t::RW; + case 2088: return access_type_t::RW; + case 2092: return access_type_t::RW; + case 2096: return access_type_t::RW; + case 2108: return access_type_t::RW; + case 2116: return access_type_t::RW; + case 2120: return access_type_t::RW; + case 2124: return access_type_t::RW; + case 2128: return access_type_t::RW; + case 2132: return access_type_t::RW; + case 2136: return access_type_t::RW; + case 2140: return access_type_t::RW; + case 2144: return access_type_t::RW; + case 2152: return access_type_t::RW; + case 2156: return access_type_t::RW; + case 2160: return access_type_t::RW; + case 2172: return access_type_t::RW; + case 2176: return access_type_t::RW; + case 2180: return access_type_t::RW; + case 2184: return access_type_t::RW; + case 2192: return access_type_t::RW; + case 2196: return access_type_t::RW; + case 2200: return access_type_t::RW; + case 2204: return access_type_t::RW; + case 2208: return access_type_t::RW; + case 2212: return access_type_t::RW; + case 2216: return access_type_t::RW; + case 2220: return access_type_t::RW; + case 2224: return access_type_t::RW; + case 2228: return access_type_t::RW; + case 2232: return access_type_t::RW; + case 2236: return access_type_t::RW; + case 2240: return 
access_type_t::RW; + case 2244: return access_type_t::RW; + case 2248: return access_type_t::RW; + case 2252: return access_type_t::RW; + case 2256: return access_type_t::RW; + case 2304: return access_type_t::RW; + case 2324: return access_type_t::RW; + case 2340: return access_type_t::RW; + case 2344: return access_type_t::RW; + case 2348: return access_type_t::RW; + case 2352: return access_type_t::RW; + case 2364: return access_type_t::RW; + case 2560: return access_type_t::RW; + case 2568: return access_type_t::RW; + case 2576: return access_type_t::RW; + case 2584: return access_type_t::RW; + case 2592: return access_type_t::RW; + case 2600: return access_type_t::RW; + case 2608: return access_type_t::RW; + case 2624: return access_type_t::RW; + case 2632: return access_type_t::RW; + case 2640: return access_type_t::RW; + case 2648: return access_type_t::RW; + case 2656: return access_type_t::RW; + case 2664: return access_type_t::RW; + case 2672: return access_type_t::RW; + case 2688: return access_type_t::RW; + case 2696: return access_type_t::RW; + case 2704: return access_type_t::RW; + case 2712: return access_type_t::RW; + case 2720: return access_type_t::RW; + case 2728: return access_type_t::RW; + case 2736: return access_type_t::RW; + case 2744: return access_type_t::RW; + case 2752: return access_type_t::RW; + case 2760: return access_type_t::RW; + case 2768: return access_type_t::RW; + case 2776: return access_type_t::RW; + case 2784: return access_type_t::RW; + case 2792: return access_type_t::RW; + case 2800: return access_type_t::RW; + case 2808: return access_type_t::RW; + case 2816: return access_type_t::RW; + case 2824: return access_type_t::RW; + case 2832: return access_type_t::RW; + case 2840: return access_type_t::RW; + case 2848: return access_type_t::RW; + case 2856: return access_type_t::RW; + case 2864: return access_type_t::RW; + case 2880: return access_type_t::RW; + case 2888: return access_type_t::RW; + case 2896: return 
access_type_t::RW; + case 2904: return access_type_t::RW; + case 2912: return access_type_t::RW; + case 2920: return access_type_t::RW; + case 2928: return access_type_t::RW; + case 2936: return access_type_t::RW; + case 3008: return access_type_t::RW; + case 3016: return access_type_t::RW; + case 4032: return access_type_t::RO; + case 4048: return access_type_t::RO; + case 4052: return access_type_t::RO; + case 4056: return access_type_t::RO; + case 4060: return access_type_t::RO; + case 4064: return access_type_t::RO; + case 4068: return access_type_t::RO; + case 4072: return access_type_t::RO; + case 4076: return access_type_t::RO; + case 4080: return access_type_t::RO; + case 4084: return access_type_t::RO; + case 4088: return access_type_t::RO; + case 4092: return access_type_t::RO; + case 4352: return access_type_t::RO; + case 4356: return access_type_t::RO; + case 4360: return access_type_t::RO; + case 4368: return access_type_t::RO; + case 4372: return access_type_t::RO; + case 4480: return access_type_t::RW; + case 4484: return access_type_t::RW; + case 4488: return access_type_t::RW; + case 4492: return access_type_t::RW; + case 4496: return access_type_t::RW; + case 4500: return access_type_t::RW; + case 4504: return access_type_t::RW; + case 4512: return access_type_t::RW; + case 4520: return access_type_t::RW; + case 4524: return access_type_t::RW; + case 4528: return access_type_t::RW; + case 4864: return access_type_t::RW; + case 4868: return access_type_t::RW; + case 4872: return access_type_t::RW; + case 4876: return access_type_t::RW; + case 4880: return access_type_t::RW; + case 4884: return access_type_t::RW; + case 4888: return access_type_t::RW; + case 4892: return access_type_t::RW; + case 4992: return access_type_t::RW; + case 4996: return access_type_t::RW; + case 5000: return access_type_t::RW; + case 5004: return access_type_t::RW; + case 5008: return access_type_t::RW; + case 5012: return access_type_t::RW; + case 5016: return 
access_type_t::RW; + case 5020: return access_type_t::RW; + default: return access_type_t::RO; + } + } +#endif +}; + +#ifdef __cplusplus +struct isa +{ +#ifdef NPU_DISASSEMBLE +static int disassemble(const uint32_t* in, std::string& op, std::vector>& fields) +{ + switch (*in & 0xffff) + { + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP): + { + const npu_op_stop_t& v = *reinterpret_cast(in); + op = "NPU_OP_STOP"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ): + { + const npu_op_irq_t& v = *reinterpret_cast(in); + op = "NPU_OP_IRQ"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_CONV): + { + const npu_op_conv_t& v = *reinterpret_cast(in); + op = "NPU_OP_CONV"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DEPTHWISE): + { + const npu_op_depthwise_t& v = *reinterpret_cast(in); + op = "NPU_OP_DEPTHWISE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL): + { + const npu_op_pool_t& v = *reinterpret_cast(in); + op = "NPU_OP_POOL"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE): + { + const npu_op_elementwise_t& v = *reinterpret_cast(in); + op = "NPU_OP_ELEMENTWISE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_RESIZE): + { + const npu_op_resize_t& v = *reinterpret_cast(in); + op = "NPU_OP_RESIZE"; + v.disassemble(fields); + break; + } + case 
(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_START): + { + const npu_op_dma_start_t& v = *reinterpret_cast(in); + op = "NPU_OP_DMA_START"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT): + { + const npu_op_dma_wait_t& v = *reinterpret_cast(in); + op = "NPU_OP_DMA_WAIT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT): + { + const npu_op_kernel_wait_t& v = *reinterpret_cast(in); + op = "NPU_OP_KERNEL_WAIT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK): + { + const npu_op_pmu_mask_t& v = *reinterpret_cast(in); + op = "NPU_OP_PMU_MASK"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP): + { + const npu_set_ifm_pad_top_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PAD_TOP"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT): + { + const npu_set_ifm_pad_left_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PAD_LEFT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT): + { + const npu_set_ifm_pad_right_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PAD_RIGHT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM): + { + const npu_set_ifm_pad_bottom_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PAD_BOTTOM"; + 
v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1): + { + const npu_set_ifm_depth_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_DEPTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION): + { + const npu_set_ifm_precision_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_PRECISION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE): + { + const npu_set_ifm_upscale_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_UPSCALE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT): + { + const npu_set_ifm_zero_point_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_ZERO_POINT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1): + { + const npu_set_ifm_width0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_WIDTH0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1): + { + const npu_set_ifm_height0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_HEIGHT0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1): + { + const npu_set_ifm_height1_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_HEIGHT1_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | 
static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION): + { + const npu_set_ifm_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_BROADCAST): + { + const npu_set_ifm_broadcast_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_BROADCAST"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1): + { + const npu_set_ofm_width_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_WIDTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1): + { + const npu_set_ofm_height_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_HEIGHT_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1): + { + const npu_set_ofm_depth_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_DEPTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION): + { + const npu_set_ofm_precision_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_PRECISION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1): + { + const npu_set_ofm_blk_width_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BLK_WIDTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1): + { + const npu_set_ofm_blk_height_m1_t& v = *reinterpret_cast(in); + op = 
"NPU_SET_OFM_BLK_HEIGHT_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1): + { + const npu_set_ofm_blk_depth_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BLK_DEPTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT): + { + const npu_set_ofm_zero_point_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_ZERO_POINT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1): + { + const npu_set_ofm_width0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_WIDTH0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1): + { + const npu_set_ofm_height0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_HEIGHT0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1): + { + const npu_set_ofm_height1_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_HEIGHT1_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION): + { + const npu_set_ofm_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1): + { + const npu_set_kernel_width_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_KERNEL_WIDTH_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | 
static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1): + { + const npu_set_kernel_height_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_KERNEL_HEIGHT_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE): + { + const npu_set_kernel_stride_t& v = *reinterpret_cast(in); + op = "NPU_SET_KERNEL_STRIDE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT): + { + const npu_set_acc_format_t& v = *reinterpret_cast(in); + op = "NPU_SET_ACC_FORMAT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION): + { + const npu_set_activation_t& v = *reinterpret_cast(in); + op = "NPU_SET_ACTIVATION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN): + { + const npu_set_activation_min_t& v = *reinterpret_cast(in); + op = "NPU_SET_ACTIVATION_MIN"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX): + { + const npu_set_activation_max_t& v = *reinterpret_cast(in); + op = "NPU_SET_ACTIVATION_MAX"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION): + { + const npu_set_weight_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION): + { + const npu_set_scale_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_SCALE_REGION"; + 
v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_FORMAT): + { + const npu_set_weight_format_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT_FORMAT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP): + { + const npu_set_blockdep_t& v = *reinterpret_cast(in); + op = "NPU_SET_BLOCKDEP"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_SCALE_N_M1): + { + const npu_set_resize_x_scale_n_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_RESIZE_X_SCALE_N_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_SCALE_N_M1): + { + const npu_set_resize_y_scale_n_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_RESIZE_Y_SCALE_N_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_OFFSET): + { + const npu_set_resize_x_offset_t& v = *reinterpret_cast(in); + op = "NPU_SET_RESIZE_X_OFFSET"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_OFFSET): + { + const npu_set_resize_y_offset_t& v = *reinterpret_cast(in); + op = "NPU_SET_RESIZE_Y_OFFSET"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION): + { + const npu_set_dma0_src_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SRC_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | 
static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION): + { + const npu_set_dma0_dst_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_DST_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0): + { + const npu_set_dma0_size0_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SIZE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1): + { + const npu_set_dma0_size1_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SIZE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_IDX_REGION): + { + const npu_set_dma0_idx_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_IDX_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST): + { + const npu_set_ifm2_broadcast_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BROADCAST"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION): + { + const npu_set_ifm2_precision_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_PRECISION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT): + { + const npu_set_ifm2_zero_point_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_ZERO_POINT"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1): + { + const npu_set_ifm2_width0_m1_t& v = *reinterpret_cast(in); + op = 
"NPU_SET_IFM2_WIDTH0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1): + { + const npu_set_ifm2_height0_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_HEIGHT0_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1): + { + const npu_set_ifm2_height1_m1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_HEIGHT1_M1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION): + { + const npu_set_ifm2_region_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_REGION"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0): + { + const npu_set_ifm_base0_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_BASE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1): + { + const npu_set_ifm_base1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_BASE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2): + { + const npu_set_ifm_base2_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_BASE2"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3): + { + const npu_set_ifm_base3_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_BASE3"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X): + { + 
const npu_set_ifm_stride_x_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_STRIDE_X"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y): + { + const npu_set_ifm_stride_y_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_STRIDE_Y"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C): + { + const npu_set_ifm_stride_c_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_STRIDE_C"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0): + { + const npu_set_ofm_base0_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BASE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1): + { + const npu_set_ofm_base1_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BASE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2): + { + const npu_set_ofm_base2_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BASE2"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3): + { + const npu_set_ofm_base3_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_BASE3"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X): + { + const npu_set_ofm_stride_x_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_STRIDE_X"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | 
static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y): + { + const npu_set_ofm_stride_y_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_STRIDE_Y"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C): + { + const npu_set_ofm_stride_c_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_STRIDE_C"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE): + { + const npu_set_weight_base_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT_BASE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH): + { + const npu_set_weight_length_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT_LENGTH"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE): + { + const npu_set_scale_base_t& v = *reinterpret_cast(in); + op = "NPU_SET_SCALE_BASE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH): + { + const npu_set_scale_length_t& v = *reinterpret_cast(in); + op = "NPU_SET_SCALE_LENGTH"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE): + { + const npu_set_ofm_scale_t& v = *reinterpret_cast(in); + op = "NPU_SET_OFM_SCALE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_SCALE): + { + const npu_set_ifm_scale_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM_SCALE"; + v.disassemble(fields); + break; + } + case 
(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_SCALE): + { + const npu_set_ifm2_scale_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_SCALE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OP_SCALAR): + { + const npu_set_op_scalar_t& v = *reinterpret_cast(in); + op = "NPU_SET_OP_SCALAR"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC): + { + const npu_set_dma0_src_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SRC"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST): + { + const npu_set_dma0_dst_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_DST"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN): + { + const npu_set_dma0_len_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_LEN"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE0): + { + const npu_set_dma0_src_stride0_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SRC_STRIDE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE1): + { + const npu_set_dma0_src_stride1_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_SRC_STRIDE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE0): + { + const npu_set_dma0_dst_stride0_t& v = *reinterpret_cast(in); + op = 
"NPU_SET_DMA0_DST_STRIDE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE1): + { + const npu_set_dma0_dst_stride1_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_DST_STRIDE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX): + { + const npu_set_dma0_idx_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_IDX"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_MAX): + { + const npu_set_dma0_idx_max_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_IDX_MAX"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_SKIP1): + { + const npu_set_dma0_idx_skip1_t& v = *reinterpret_cast(in); + op = "NPU_SET_DMA0_IDX_SKIP1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0): + { + const npu_set_ifm2_base0_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BASE0"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1): + { + const npu_set_ifm2_base1_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BASE1"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2): + { + const npu_set_ifm2_base2_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BASE2"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3): 
+ { + const npu_set_ifm2_base3_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_BASE3"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X): + { + const npu_set_ifm2_stride_x_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_STRIDE_X"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y): + { + const npu_set_ifm2_stride_y_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_STRIDE_Y"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C): + { + const npu_set_ifm2_stride_c_t& v = *reinterpret_cast(in); + op = "NPU_SET_IFM2_STRIDE_C"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE): + { + const npu_set_weight1_base_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT1_BASE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH): + { + const npu_set_weight1_length_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT1_LENGTH"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_BASE): + { + const npu_set_weight2_base_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT2_BASE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_LENGTH): + { + const npu_set_weight2_length_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT2_LENGTH"; + v.disassemble(fields); + break; + } + case 
(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_BASE): + { + const npu_set_weight3_base_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT3_BASE"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_LENGTH): + { + const npu_set_weight3_length_t& v = *reinterpret_cast(in); + op = "NPU_SET_WEIGHT3_LENGTH"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_X): + { + const npu_set_resize_x_step_t& v = *reinterpret_cast(in); + op = "NPU_SET_RESIZE_X_STEP"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_Y): + { + const npu_set_resize_y_step_t& v = *reinterpret_cast(in); + op = "NPU_SET_RESIZE_Y_STEP"; + v.disassemble(fields); + break; + } + case (static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL) << 14) | static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_OP_BRANCH): + { + const npu_op_branch_t& v = *reinterpret_cast(in); + op = "NPU_OP_BRANCH"; + v.disassemble(fields); + break; + } + default: break; + } + return (*in & (3<<14)) != 0 ? 
2 : 1; +} +#endif +#endif + +struct npu_op_stop_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t mask:16; +#ifdef __cplusplus +public: + npu_op_stop_t(uint32_t _mask) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mask(_mask & ((1U << 16)-1)) + {} + CONSTEXPR npu_op_stop_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mask(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_STOP); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(mask) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_stop_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_stop_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_mask() const + { + return static_cast(mask); + } + CONSTEXPR npu_op_stop_t& set_mask(uint32_t value) + { + assert((value >> 16) == 0); + mask = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("mask", std::to_string(mask))); + } +#endif +#endif +}; + +struct npu_op_irq_t +{ 
+#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t mask:16; +#ifdef __cplusplus +public: + npu_op_irq_t(uint32_t _mask) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mask(_mask & ((1U << 16)-1)) + {} + CONSTEXPR npu_op_irq_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mask(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_IRQ); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(mask) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_irq_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_irq_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_mask() const + { + return static_cast(mask); + } + CONSTEXPR npu_op_irq_t& set_mask(uint32_t value) + { + assert((value >> 16) == 0); + mask = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("mask", std::to_string(mask))); + } +#endif +#endif +}; + +struct npu_op_conv_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + 
uint32_t reserved0:4; + uint32_t control:2; + uint32_t weights_ifm2:1; + uint32_t reserved1:15; +#ifdef __cplusplus +public: + npu_op_conv_t(uint32_t _weights_ifm2) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_CONV)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + weights_ifm2(_weights_ifm2 & ((1U << 1)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_conv_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_CONV)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + weights_ifm2(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_CONV) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_CONV); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(weights_ifm2) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_conv_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_conv_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_weights_ifm2() const + { + return static_cast(weights_ifm2); + } + CONSTEXPR npu_op_conv_t& set_weights_ifm2(uint32_t value) + { + assert((value >> 1) == 0); + weights_ifm2 = static_cast(value & ((1U << 1)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("weights_ifm2", std::to_string(weights_ifm2))); + } +#endif +#endif 
+}; + +struct npu_op_depthwise_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; +#ifdef __cplusplus +public: + CONSTEXPR npu_op_depthwise_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DEPTHWISE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DEPTHWISE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DEPTHWISE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_depthwise_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_depthwise_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>&) const + { + } +#endif +#endif +}; + +struct npu_op_pool_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t pooling_mode:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_op_pool_t(NPU_NAMESPACE::pooling_mode _pooling_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pooling_mode(static_cast(_pooling_mode) & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_pool_t() : + 
opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pooling_mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_POOL); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pooling_mode) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_pool_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_pool_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::pooling_mode get_pooling_mode() const + { + return static_cast(pooling_mode); + } + CONSTEXPR npu_op_pool_t& set_pooling_mode(NPU_NAMESPACE::pooling_mode value) + { + pooling_mode = static_cast(value) & ((1U << 3)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pooling_mode", (pooling_mode < (sizeof(pooling_mode_str)/sizeof(pooling_mode_str[0])) ? 
pooling_mode_str[pooling_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_op_elementwise_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t elementwise_mode:6; + uint32_t reserved1:10; +#ifdef __cplusplus +public: + npu_op_elementwise_t(NPU_NAMESPACE::elementwise_mode _elementwise_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + elementwise_mode(static_cast(_elementwise_mode) & ((1U << 6)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_elementwise_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + elementwise_mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_ELEMENTWISE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(elementwise_mode) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_elementwise_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_elementwise_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::elementwise_mode get_elementwise_mode() const + { + return static_cast(elementwise_mode); + } + CONSTEXPR npu_op_elementwise_t& 
set_elementwise_mode(NPU_NAMESPACE::elementwise_mode value) + { + elementwise_mode = static_cast(value) & ((1U << 6)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("elementwise_mode", (elementwise_mode < (sizeof(elementwise_mode_str)/sizeof(elementwise_mode_str[0])) ? elementwise_mode_str[elementwise_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_op_resize_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t resize_mode:2; + uint32_t reserved1:14; +#ifdef __cplusplus +public: + npu_op_resize_t(NPU_NAMESPACE::resize_mode _resize_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_RESIZE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_mode(static_cast(_resize_mode) & ((1U << 2)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_resize_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_RESIZE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_RESIZE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_RESIZE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(resize_mode) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_resize_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR 
npu_op_resize_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::resize_mode get_resize_mode() const + { + return static_cast(resize_mode); + } + CONSTEXPR npu_op_resize_t& set_resize_mode(NPU_NAMESPACE::resize_mode value) + { + resize_mode = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("resize_mode", (resize_mode < (sizeof(resize_mode_str)/sizeof(resize_mode_str[0])) ? resize_mode_str[resize_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_op_dma_start_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; +#ifdef __cplusplus +public: + CONSTEXPR npu_op_dma_start_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_START)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_START) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_START); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_dma_start_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_dma_start_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return 
*this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>&) const + { + } +#endif +#endif +}; + +struct npu_op_dma_wait_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t k:2; + uint32_t reserved1:14; +#ifdef __cplusplus +public: + npu_op_dma_wait_t(uint32_t _k) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + k(_k & ((1U << 2)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_dma_wait_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + k(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_DMA_WAIT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(k) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_dma_wait_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_dma_wait_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_k() const + { + return static_cast(k); + } + CONSTEXPR npu_op_dma_wait_t& set_k(uint32_t value) + { + assert((value >> 2) == 0); + k = static_cast(value & ((1U << 2)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& 
fields) const + { + fields.push_back(std::make_pair("k", std::to_string(k))); + } +#endif +#endif +}; + +struct npu_op_kernel_wait_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t n:1; + uint32_t reserved1:15; +#ifdef __cplusplus +public: + npu_op_kernel_wait_t(uint32_t _n) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + n(_n & ((1U << 1)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_kernel_wait_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + n(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_KERNEL_WAIT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(n) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_kernel_wait_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_kernel_wait_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_n() const + { + return static_cast(n); + } + CONSTEXPR npu_op_kernel_wait_t& set_n(uint32_t value) + { + assert((value >> 1) == 0); + n = static_cast(value & ((1U << 1)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + 
void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("n", std::to_string(n))); + } +#endif +#endif +}; + +struct npu_op_pmu_mask_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t enable:1; + uint32_t reserved1:15; +#ifdef __cplusplus +public: + npu_op_pmu_mask_t(uint32_t _enable) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + enable(_enable & ((1U << 1)-1)), + reserved1(0) + {} + CONSTEXPR npu_op_pmu_mask_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + enable(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_OP_PMU_MASK); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(enable) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_pmu_mask_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_pmu_mask_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_enable() const + { + return static_cast(enable); + } + CONSTEXPR npu_op_pmu_mask_t& set_enable(uint32_t value) + { + assert((value >> 1) == 0); + enable = static_cast(value & ((1U << 
1)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("enable", std::to_string(enable))); + } +#endif +#endif +}; + +struct npu_set_ifm_pad_top_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t pad:7; + uint32_t reserved1:9; +#ifdef __cplusplus +public: + npu_set_ifm_pad_top_t(uint32_t _pad) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(_pad & ((1U << 7)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_pad_top_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_TOP); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pad) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_pad_top_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_pad_top_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_pad() const + { + return static_cast(pad); + } + CONSTEXPR npu_set_ifm_pad_top_t& set_pad(uint32_t value) + 
{ + assert((value >> 7) == 0); + pad = static_cast(value & ((1U << 7)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pad", std::to_string(pad))); + } +#endif +#endif +}; + +struct npu_set_ifm_pad_left_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t pad:7; + uint32_t reserved1:9; +#ifdef __cplusplus +public: + npu_set_ifm_pad_left_t(uint32_t _pad) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(_pad & ((1U << 7)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_pad_left_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_LEFT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pad) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_pad_left_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_pad_left_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_pad() const + { + return 
static_cast(pad); + } + CONSTEXPR npu_set_ifm_pad_left_t& set_pad(uint32_t value) + { + assert((value >> 7) == 0); + pad = static_cast(value & ((1U << 7)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pad", std::to_string(pad))); + } +#endif +#endif +}; + +struct npu_set_ifm_pad_right_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t pad:8; + uint32_t reserved1:8; +#ifdef __cplusplus +public: + npu_set_ifm_pad_right_t(uint32_t _pad) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(_pad & ((1U << 8)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_pad_right_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_RIGHT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pad) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_pad_right_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_pad_right_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U 
<< 2)-1); + return *this; + } + CONSTEXPR uint32_t get_pad() const + { + return static_cast(pad); + } + CONSTEXPR npu_set_ifm_pad_right_t& set_pad(uint32_t value) + { + assert((value >> 8) == 0); + pad = static_cast(value & ((1U << 8)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pad", std::to_string(pad))); + } +#endif +#endif +}; + +struct npu_set_ifm_pad_bottom_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t pad:8; + uint32_t reserved1:8; +#ifdef __cplusplus +public: + npu_set_ifm_pad_bottom_t(uint32_t _pad) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(_pad & ((1U << 8)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_pad_bottom_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + pad(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PAD_BOTTOM); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(pad) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_pad_bottom_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR 
npu_set_ifm_pad_bottom_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_pad() const + { + return static_cast(pad); + } + CONSTEXPR npu_set_ifm_pad_bottom_t& set_pad(uint32_t value) + { + assert((value >> 8) == 0); + pad = static_cast(value & ((1U << 8)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("pad", std::to_string(pad))); + } +#endif +#endif +}; + +struct npu_set_ifm_depth_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t depth_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm_depth_m1_t(uint32_t _depth_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(_depth_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_depth_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_DEPTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(depth_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_depth_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const 
+ { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_depth_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_depth_m1() const + { + return static_cast(depth_m1); + } + CONSTEXPR npu_set_ifm_depth_m1_t& set_depth_m1(uint32_t value) + { + assert((value >> 16) == 0); + depth_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("depth_m1", std::to_string(depth_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm_precision_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t activation_type:1; + uint32_t reserved1:1; + uint32_t activation_precision:2; + uint32_t reserved2:2; + uint32_t activation_format:2; + uint32_t reserved3:6; + uint32_t activation_storage:2; +#ifdef __cplusplus +public: + npu_set_ifm_precision_t(NPU_NAMESPACE::activation_type _activation_type, NPU_NAMESPACE::activation_precision _activation_precision, NPU_NAMESPACE::activation_format _activation_format, NPU_NAMESPACE::activation_storage _activation_storage) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(static_cast(_activation_type) & ((1U << 1)-1)), + reserved1(0), + activation_precision(static_cast(_activation_precision) & ((1U << 2)-1)), + reserved2(0), + activation_format(static_cast(_activation_format) & ((1U << 2)-1)), + reserved3(0), + activation_storage(static_cast(_activation_storage) & ((1U << 2)-1)) + {} + CONSTEXPR npu_set_ifm_precision_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(0), + reserved1(0), + activation_precision(0), + reserved2(0), + 
activation_format(0), + reserved3(0), + activation_storage(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_PRECISION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(activation_type) << 16; + word |= uint32_t(activation_precision) << 18; + word |= uint32_t(activation_format) << 22; + word |= uint32_t(activation_storage) << 30; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_precision_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_precision_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_type get_activation_type() const + { + return static_cast(activation_type); + } + CONSTEXPR npu_set_ifm_precision_t& set_activation_type(NPU_NAMESPACE::activation_type value) + { + activation_type = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_precision get_activation_precision() const + { + return static_cast(activation_precision); + } + CONSTEXPR npu_set_ifm_precision_t& set_activation_precision(NPU_NAMESPACE::activation_precision value) + { + activation_precision = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_format get_activation_format() const + { + return static_cast(activation_format); + } + CONSTEXPR 
npu_set_ifm_precision_t& set_activation_format(NPU_NAMESPACE::activation_format value) + { + activation_format = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_storage get_activation_storage() const + { + return static_cast(activation_storage); + } + CONSTEXPR npu_set_ifm_precision_t& set_activation_storage(NPU_NAMESPACE::activation_storage value) + { + activation_storage = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("activation_type", (activation_type < (sizeof(activation_type_str)/sizeof(activation_type_str[0])) ? activation_type_str[activation_type] : "****"))); + fields.push_back(std::make_pair("activation_precision", (activation_precision < (sizeof(activation_precision_str)/sizeof(activation_precision_str[0])) ? activation_precision_str[activation_precision] : "****"))); + fields.push_back(std::make_pair("activation_format", (activation_format < (sizeof(activation_format_str)/sizeof(activation_format_str[0])) ? activation_format_str[activation_format] : "****"))); + fields.push_back(std::make_pair("activation_storage", (activation_storage < (sizeof(activation_storage_str)/sizeof(activation_storage_str[0])) ? 
activation_storage_str[activation_storage] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ifm_upscale_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t mode:2; + uint32_t reserved1:14; +#ifdef __cplusplus +public: + npu_set_ifm_upscale_t(NPU_NAMESPACE::ifm_upscale_mode _mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mode(static_cast(_mode) & ((1U << 2)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_upscale_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_UPSCALE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(mode) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_upscale_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_upscale_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::ifm_upscale_mode get_mode() const + { + return static_cast(mode); + } + CONSTEXPR npu_set_ifm_upscale_t& set_mode(NPU_NAMESPACE::ifm_upscale_mode value) + { + mode = 
static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("mode", (mode < (sizeof(ifm_upscale_mode_str)/sizeof(ifm_upscale_mode_str[0])) ? ifm_upscale_mode_str[mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ifm_zero_point_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t zero_point:16; +#ifdef __cplusplus +public: + npu_set_ifm_zero_point_t(uint32_t _zero_point) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(_zero_point & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_zero_point_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_ZERO_POINT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(zero_point) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_zero_point_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_zero_point_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + 
CONSTEXPR uint32_t get_zero_point() const + { + return static_cast(zero_point); + } + CONSTEXPR npu_set_ifm_zero_point_t& set_zero_point(uint32_t value) + { + assert((value >> 16) == 0); + zero_point = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("zero_point", std::to_string(zero_point))); + } +#endif +#endif +}; + +struct npu_set_ifm_width0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm_width0_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_width0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_WIDTH0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_width0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_width0_m1_t& 
set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_ifm_width0_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm_height0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm_height0_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_height0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_height0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR 
NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_height0_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ifm_height0_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm_height1_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm_height1_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm_height1_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_HEIGHT1_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_height1_m1_t& 
set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_height1_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ifm_height1_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_ifm_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR 
NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_ifm_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_ifm_broadcast_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t broadcast_mode:4; + uint32_t reserved1:12; +#ifdef __cplusplus +public: + npu_set_ifm_broadcast_t(NPU_NAMESPACE::broadcast_mode _broadcast_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_BROADCAST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + broadcast_mode(static_cast(_broadcast_mode) & ((1U << 4)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm_broadcast_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_BROADCAST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + broadcast_mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_BROADCAST) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM_BROADCAST); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); 
+ } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(broadcast_mode) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_broadcast_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_broadcast_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::broadcast_mode get_broadcast_mode() const + { + return static_cast(broadcast_mode); + } + CONSTEXPR npu_set_ifm_broadcast_t& set_broadcast_mode(NPU_NAMESPACE::broadcast_mode value) + { + broadcast_mode = static_cast(value) & ((1U << 4)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("broadcast_mode", (broadcast_mode < (sizeof(broadcast_mode_str)/sizeof(broadcast_mode_str[0])) ? 
broadcast_mode_str[broadcast_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ofm_width_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_width_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_width_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_width_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_width_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_ofm_width_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef 
NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_height_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_height_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_height_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_height_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_height_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ofm_height_m1_t& set_height_m1(uint32_t value) + { + 
assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_depth_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t depth_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_depth_m1_t(uint32_t _depth_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(_depth_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_depth_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_DEPTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(depth_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_depth_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_depth_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_depth_m1() const + { + return 
static_cast(depth_m1); + } + CONSTEXPR npu_set_ofm_depth_m1_t& set_depth_m1(uint32_t value) + { + assert((value >> 16) == 0); + depth_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("depth_m1", std::to_string(depth_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_precision_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t activation_type:1; + uint32_t activation_precision:2; + uint32_t reserved1:3; + uint32_t activation_format:2; + uint32_t scale_mode:1; + uint32_t activation_reverse:2; + uint32_t activation_transpose:3; + uint32_t activation_storage:2; +#ifdef __cplusplus +public: + npu_set_ofm_precision_t(NPU_NAMESPACE::activation_type _activation_type, NPU_NAMESPACE::activation_precision _activation_precision, NPU_NAMESPACE::activation_format _activation_format, NPU_NAMESPACE::ofm_scale_mode _scale_mode, NPU_NAMESPACE::activation_reverse _activation_reverse, NPU_NAMESPACE::activation_transpose _activation_transpose, NPU_NAMESPACE::activation_storage _activation_storage) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(static_cast(_activation_type) & ((1U << 1)-1)), + activation_precision(static_cast(_activation_precision) & ((1U << 2)-1)), + reserved1(0), + activation_format(static_cast(_activation_format) & ((1U << 2)-1)), + scale_mode(static_cast(_scale_mode) & ((1U << 1)-1)), + activation_reverse(static_cast(_activation_reverse) & ((1U << 2)-1)), + activation_transpose(static_cast(_activation_transpose) & ((1U << 3)-1)), + activation_storage(static_cast(_activation_storage) & ((1U << 2)-1)) + {} + CONSTEXPR npu_set_ofm_precision_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(0), + activation_precision(0), + reserved1(0), + activation_format(0), + scale_mode(0), + activation_reverse(0), + activation_transpose(0), + activation_storage(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_PRECISION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(activation_type) << 16; + word |= uint32_t(activation_precision) << 17; + word |= uint32_t(activation_format) << 22; + word |= uint32_t(scale_mode) << 24; + word |= uint32_t(activation_reverse) << 25; + word |= uint32_t(activation_transpose) << 27; + word |= uint32_t(activation_storage) << 30; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_precision_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_precision_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_type get_activation_type() const + { + return static_cast(activation_type); + } + CONSTEXPR npu_set_ofm_precision_t& set_activation_type(NPU_NAMESPACE::activation_type value) + { + activation_type = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_precision get_activation_precision() const + { + return static_cast(activation_precision); + } + CONSTEXPR 
npu_set_ofm_precision_t& set_activation_precision(NPU_NAMESPACE::activation_precision value) + { + activation_precision = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_format get_activation_format() const + { + return static_cast(activation_format); + } + CONSTEXPR npu_set_ofm_precision_t& set_activation_format(NPU_NAMESPACE::activation_format value) + { + activation_format = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::ofm_scale_mode get_scale_mode() const + { + return static_cast(scale_mode); + } + CONSTEXPR npu_set_ofm_precision_t& set_scale_mode(NPU_NAMESPACE::ofm_scale_mode value) + { + scale_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_reverse get_activation_reverse() const + { + return static_cast(activation_reverse); + } + CONSTEXPR npu_set_ofm_precision_t& set_activation_reverse(NPU_NAMESPACE::activation_reverse value) + { + activation_reverse = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_transpose get_activation_transpose() const + { + return static_cast(activation_transpose); + } + CONSTEXPR npu_set_ofm_precision_t& set_activation_transpose(NPU_NAMESPACE::activation_transpose value) + { + activation_transpose = static_cast(value) & ((1U << 3)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_storage get_activation_storage() const + { + return static_cast(activation_storage); + } + CONSTEXPR npu_set_ofm_precision_t& set_activation_storage(NPU_NAMESPACE::activation_storage value) + { + activation_storage = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("activation_type", (activation_type < (sizeof(activation_type_str)/sizeof(activation_type_str[0])) ? 
activation_type_str[activation_type] : "****"))); + fields.push_back(std::make_pair("activation_precision", (activation_precision < (sizeof(activation_precision_str)/sizeof(activation_precision_str[0])) ? activation_precision_str[activation_precision] : "****"))); + fields.push_back(std::make_pair("activation_format", (activation_format < (sizeof(activation_format_str)/sizeof(activation_format_str[0])) ? activation_format_str[activation_format] : "****"))); + fields.push_back(std::make_pair("scale_mode", (scale_mode < (sizeof(ofm_scale_mode_str)/sizeof(ofm_scale_mode_str[0])) ? ofm_scale_mode_str[scale_mode] : "****"))); + fields.push_back(std::make_pair("activation_reverse", (activation_reverse < (sizeof(activation_reverse_str)/sizeof(activation_reverse_str[0])) ? activation_reverse_str[activation_reverse] : "****"))); + fields.push_back(std::make_pair("activation_transpose", (activation_transpose < (sizeof(activation_transpose_str)/sizeof(activation_transpose_str[0])) ? activation_transpose_str[activation_transpose] : "****"))); + fields.push_back(std::make_pair("activation_storage", (activation_storage < (sizeof(activation_storage_str)/sizeof(activation_storage_str[0])) ? 
activation_storage_str[activation_storage] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ofm_blk_width_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:7; + uint32_t reserved1:9; +#ifdef __cplusplus +public: + npu_set_ofm_blk_width_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 7)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ofm_blk_width_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_WIDTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_blk_width_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_blk_width_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_ofm_blk_width_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 
7) == 0); + width_m1 = static_cast(value & ((1U << 7)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_blk_height_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:7; + uint32_t reserved1:9; +#ifdef __cplusplus +public: + npu_set_ofm_blk_height_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 7)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ofm_blk_height_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_HEIGHT_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_blk_height_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_blk_height_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + 
return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ofm_blk_height_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 7) == 0); + height_m1 = static_cast(value & ((1U << 7)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_blk_depth_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t depth_m1:10; + uint32_t reserved1:6; +#ifdef __cplusplus +public: + npu_set_ofm_blk_depth_m1_t(uint32_t _depth_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(_depth_m1 & ((1U << 10)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ofm_blk_depth_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + depth_m1(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_BLK_DEPTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(depth_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_blk_depth_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() 
const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_blk_depth_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_depth_m1() const + { + return static_cast(depth_m1); + } + CONSTEXPR npu_set_ofm_blk_depth_m1_t& set_depth_m1(uint32_t value) + { + assert((value >> 10) == 0); + depth_m1 = static_cast(value & ((1U << 10)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("depth_m1", std::to_string(depth_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_zero_point_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t zero_point:16; +#ifdef __cplusplus +public: + npu_set_ofm_zero_point_t(uint32_t _zero_point) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(_zero_point & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_zero_point_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_ZERO_POINT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(zero_point) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_zero_point_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + 
opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_zero_point_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_zero_point() const + { + return static_cast(zero_point); + } + CONSTEXPR npu_set_ofm_zero_point_t& set_zero_point(uint32_t value) + { + assert((value >> 16) == 0); + zero_point = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("zero_point", std::to_string(zero_point))); + } +#endif +#endif +}; + +struct npu_set_ofm_width0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_width0_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_width0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_WIDTH0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return 
static_cast(opcode); + } + CONSTEXPR npu_set_ofm_width0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_width0_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_ofm_width0_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_height0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_height0_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_height0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= 
uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_height0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_height0_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ofm_height0_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_height1_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ofm_height1_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ofm_height1_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_HEIGHT1_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator 
uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_height1_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_height1_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ofm_height1_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ofm_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_ofm_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ofm_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = 
static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_OFM_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_ofm_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_kernel_width_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_kernel_width_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_kernel_width_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1) && control == 
static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_WIDTH_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_kernel_width_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_kernel_width_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR npu_set_kernel_width_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_kernel_height_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_kernel_height_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_kernel_height_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR 
bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_HEIGHT_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_kernel_height_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_kernel_height_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_kernel_height_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_kernel_stride_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t stride_x_lsb:1; + uint32_t stride_y_lsb:1; + uint32_t weight_order:1; + uint32_t dilation_x:1; + uint32_t dilation_y:1; + uint32_t decomposition:1; + uint32_t stride_x_msb:1; + uint32_t reserved1:2; + uint32_t stride_y_msb:1; + uint32_t reserved2:6; +#ifdef __cplusplus +public: + npu_set_kernel_stride_t(uint32_t _stride_x_lsb, uint32_t _stride_y_lsb, NPU_NAMESPACE::weight_order 
_weight_order, NPU_NAMESPACE::kernel_dilation _dilation_x, NPU_NAMESPACE::kernel_dilation _dilation_y, NPU_NAMESPACE::kernel_decomposition _decomposition, uint32_t _stride_x_msb, uint32_t _stride_y_msb) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + stride_x_lsb(_stride_x_lsb & ((1U << 1)-1)), + stride_y_lsb(_stride_y_lsb & ((1U << 1)-1)), + weight_order(static_cast(_weight_order) & ((1U << 1)-1)), + dilation_x(static_cast(_dilation_x) & ((1U << 1)-1)), + dilation_y(static_cast(_dilation_y) & ((1U << 1)-1)), + decomposition(static_cast(_decomposition) & ((1U << 1)-1)), + stride_x_msb(_stride_x_msb & ((1U << 1)-1)), + reserved1(0), + stride_y_msb(_stride_y_msb & ((1U << 1)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_kernel_stride_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + stride_x_lsb(0), + stride_y_lsb(0), + weight_order(0), + dilation_x(0), + dilation_y(0), + decomposition(0), + stride_x_msb(0), + reserved1(0), + stride_y_msb(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_KERNEL_STRIDE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(stride_x_lsb) << 16; + word |= uint32_t(stride_y_lsb) << 17; + word |= uint32_t(weight_order) << 18; + word |= uint32_t(dilation_x) << 19; + word |= uint32_t(dilation_y) << 20; + word |= uint32_t(decomposition) << 21; + word |= uint32_t(stride_x_msb) << 22; + word |= uint32_t(stride_y_msb) << 25; + return word; + } + 
CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_kernel_stride_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_kernel_stride_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_stride_x_lsb() const + { + return static_cast(stride_x_lsb); + } + CONSTEXPR npu_set_kernel_stride_t& set_stride_x_lsb(uint32_t value) + { + assert((value >> 1) == 0); + stride_x_lsb = static_cast(value & ((1U << 1)-1)); + return *this; + } + CONSTEXPR uint32_t get_stride_y_lsb() const + { + return static_cast(stride_y_lsb); + } + CONSTEXPR npu_set_kernel_stride_t& set_stride_y_lsb(uint32_t value) + { + assert((value >> 1) == 0); + stride_y_lsb = static_cast(value & ((1U << 1)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::weight_order get_weight_order() const + { + return static_cast(weight_order); + } + CONSTEXPR npu_set_kernel_stride_t& set_weight_order(NPU_NAMESPACE::weight_order value) + { + weight_order = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::kernel_dilation get_dilation_x() const + { + return static_cast(dilation_x); + } + CONSTEXPR npu_set_kernel_stride_t& set_dilation_x(NPU_NAMESPACE::kernel_dilation value) + { + dilation_x = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::kernel_dilation get_dilation_y() const + { + return static_cast(dilation_y); + } + CONSTEXPR npu_set_kernel_stride_t& set_dilation_y(NPU_NAMESPACE::kernel_dilation value) + { + dilation_y = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::kernel_decomposition get_decomposition() const + { + return static_cast(decomposition); + } + CONSTEXPR 
npu_set_kernel_stride_t& set_decomposition(NPU_NAMESPACE::kernel_decomposition value) + { + decomposition = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR uint32_t get_stride_x_msb() const + { + return static_cast(stride_x_msb); + } + CONSTEXPR npu_set_kernel_stride_t& set_stride_x_msb(uint32_t value) + { + assert((value >> 1) == 0); + stride_x_msb = static_cast(value & ((1U << 1)-1)); + return *this; + } + CONSTEXPR uint32_t get_stride_y_msb() const + { + return static_cast(stride_y_msb); + } + CONSTEXPR npu_set_kernel_stride_t& set_stride_y_msb(uint32_t value) + { + assert((value >> 1) == 0); + stride_y_msb = static_cast(value & ((1U << 1)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("stride_x_lsb", std::to_string(stride_x_lsb))); + fields.push_back(std::make_pair("stride_y_lsb", std::to_string(stride_y_lsb))); + fields.push_back(std::make_pair("weight_order", (weight_order < (sizeof(weight_order_str)/sizeof(weight_order_str[0])) ? weight_order_str[weight_order] : "****"))); + fields.push_back(std::make_pair("dilation_x", (dilation_x < (sizeof(kernel_dilation_str)/sizeof(kernel_dilation_str[0])) ? kernel_dilation_str[dilation_x] : "****"))); + fields.push_back(std::make_pair("dilation_y", (dilation_y < (sizeof(kernel_dilation_str)/sizeof(kernel_dilation_str[0])) ? kernel_dilation_str[dilation_y] : "****"))); + fields.push_back(std::make_pair("decomposition", (decomposition < (sizeof(kernel_decomposition_str)/sizeof(kernel_decomposition_str[0])) ? 
kernel_decomposition_str[decomposition] : "****"))); + fields.push_back(std::make_pair("stride_x_msb", std::to_string(stride_x_msb))); + fields.push_back(std::make_pair("stride_y_msb", std::to_string(stride_y_msb))); + } +#endif +#endif +}; + +struct npu_set_acc_format_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t acc_format:2; + uint32_t reserved1:2; + uint32_t acc_input:2; + uint32_t acc_output:1; + uint32_t reserved2:1; + uint32_t microblock:3; + uint32_t reserved3:5; +#ifdef __cplusplus +public: + npu_set_acc_format_t(NPU_NAMESPACE::acc_format _acc_format, NPU_NAMESPACE::acc_input _acc_input, NPU_NAMESPACE::acc_output _acc_output, NPU_NAMESPACE::microblock _microblock) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + acc_format(static_cast(_acc_format) & ((1U << 2)-1)), + reserved1(0), + acc_input(static_cast(_acc_input) & ((1U << 2)-1)), + acc_output(static_cast(_acc_output) & ((1U << 1)-1)), + reserved2(0), + microblock(static_cast(_microblock) & ((1U << 3)-1)), + reserved3(0) + {} + CONSTEXPR npu_set_acc_format_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + acc_format(0), + reserved1(0), + acc_input(0), + acc_output(0), + reserved2(0), + microblock(0), + reserved3(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACC_FORMAT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(acc_format) << 16; + word |= 
uint32_t(acc_input) << 20; + word |= uint32_t(acc_output) << 22; + word |= uint32_t(microblock) << 24; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_acc_format_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_acc_format_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::acc_format get_acc_format() const + { + return static_cast(acc_format); + } + CONSTEXPR npu_set_acc_format_t& set_acc_format(NPU_NAMESPACE::acc_format value) + { + acc_format = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::acc_input get_acc_input() const + { + return static_cast(acc_input); + } + CONSTEXPR npu_set_acc_format_t& set_acc_input(NPU_NAMESPACE::acc_input value) + { + acc_input = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::acc_output get_acc_output() const + { + return static_cast(acc_output); + } + CONSTEXPR npu_set_acc_format_t& set_acc_output(NPU_NAMESPACE::acc_output value) + { + acc_output = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::microblock get_microblock() const + { + return static_cast(microblock); + } + CONSTEXPR npu_set_acc_format_t& set_microblock(NPU_NAMESPACE::microblock value) + { + microblock = static_cast(value) & ((1U << 3)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("acc_format", (acc_format < (sizeof(acc_format_str)/sizeof(acc_format_str[0])) ? acc_format_str[acc_format] : "****"))); + fields.push_back(std::make_pair("acc_input", (acc_input < (sizeof(acc_input_str)/sizeof(acc_input_str[0])) ? 
acc_input_str[acc_input] : "****"))); + fields.push_back(std::make_pair("acc_output", (acc_output < (sizeof(acc_output_str)/sizeof(acc_output_str[0])) ? acc_output_str[acc_output] : "****"))); + fields.push_back(std::make_pair("microblock", (microblock < (sizeof(microblock_str)/sizeof(microblock_str[0])) ? microblock_str[microblock] : "****"))); + } +#endif +#endif +}; + +struct npu_set_activation_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t activation_function:5; + uint32_t table:3; + uint32_t reserved1:4; + uint32_t activation_clip_range:1; + uint32_t reserved2:3; +#ifdef __cplusplus +public: + npu_set_activation_t(NPU_NAMESPACE::activation_function _activation_function, uint32_t _table, NPU_NAMESPACE::activation_clip_range _activation_clip_range) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_function(static_cast(_activation_function) & ((1U << 5)-1)), + table(_table & ((1U << 3)-1)), + reserved1(0), + activation_clip_range(static_cast(_activation_clip_range) & ((1U << 1)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_activation_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_function(0), + table(0), + reserved1(0), + activation_clip_range(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(activation_function) << 16; 
+ word |= uint32_t(table) << 21; + word |= uint32_t(activation_clip_range) << 28; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_activation_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_activation_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_function get_activation_function() const + { + return static_cast(activation_function); + } + CONSTEXPR npu_set_activation_t& set_activation_function(NPU_NAMESPACE::activation_function value) + { + activation_function = static_cast(value) & ((1U << 5)-1); + return *this; + } + CONSTEXPR uint32_t get_table() const + { + return static_cast(table); + } + CONSTEXPR npu_set_activation_t& set_table(uint32_t value) + { + assert((value >> 3) == 0); + table = static_cast(value & ((1U << 3)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_clip_range get_activation_clip_range() const + { + return static_cast(activation_clip_range); + } + CONSTEXPR npu_set_activation_t& set_activation_clip_range(NPU_NAMESPACE::activation_clip_range value) + { + activation_clip_range = static_cast(value) & ((1U << 1)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("activation_function", (activation_function < (sizeof(activation_function_str)/sizeof(activation_function_str[0])) ? activation_function_str[activation_function] : "****"))); + fields.push_back(std::make_pair("table", std::to_string(table))); + fields.push_back(std::make_pair("activation_clip_range", (activation_clip_range < (sizeof(activation_clip_range_str)/sizeof(activation_clip_range_str[0])) ? 
activation_clip_range_str[activation_clip_range] : "****"))); + } +#endif +#endif +}; + +struct npu_set_activation_min_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t clip_boundary:16; +#ifdef __cplusplus +public: + npu_set_activation_min_t(uint32_t _clip_boundary) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + clip_boundary(_clip_boundary & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_activation_min_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + clip_boundary(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MIN); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(clip_boundary) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_activation_min_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_activation_min_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_clip_boundary() const + { + return static_cast(clip_boundary); + } + CONSTEXPR npu_set_activation_min_t& set_clip_boundary(uint32_t value) + { + assert((value >> 16) == 0); + 
clip_boundary = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("clip_boundary", std::to_string(clip_boundary))); + } +#endif +#endif +}; + +struct npu_set_activation_max_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t clip_boundary:16; +#ifdef __cplusplus +public: + npu_set_activation_max_t(uint32_t _clip_boundary) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + clip_boundary(_clip_boundary & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_activation_max_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + clip_boundary(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_ACTIVATION_MAX); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(clip_boundary) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_activation_max_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_activation_max_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_clip_boundary() 
const + { + return static_cast(clip_boundary); + } + CONSTEXPR npu_set_activation_max_t& set_clip_boundary(uint32_t value) + { + assert((value >> 16) == 0); + clip_boundary = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("clip_boundary", std::to_string(clip_boundary))); + } +#endif +#endif +}; + +struct npu_set_weight_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_weight_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_weight_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_weight_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR 
npu_set_weight_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_weight_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_scale_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_scale_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_scale_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_SCALE_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_scale_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; 
+ } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_scale_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_scale_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_weight_format_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t weight_format:1; + uint32_t reserved1:3; + uint32_t weight_sparsity:1; + uint32_t reserved2:11; +#ifdef __cplusplus +public: + npu_set_weight_format_t(NPU_NAMESPACE::weight_format _weight_format, NPU_NAMESPACE::weight_sparsity _weight_sparsity) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_FORMAT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + weight_format(static_cast(_weight_format) & ((1U << 1)-1)), + reserved1(0), + weight_sparsity(static_cast(_weight_sparsity) & ((1U << 1)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_weight_format_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_FORMAT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + weight_format(0), + reserved1(0), + weight_sparsity(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_FORMAT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_WEIGHT_FORMAT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + 
operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(weight_format) << 16; + word |= uint32_t(weight_sparsity) << 20; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_weight_format_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_weight_format_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::weight_format get_weight_format() const + { + return static_cast(weight_format); + } + CONSTEXPR npu_set_weight_format_t& set_weight_format(NPU_NAMESPACE::weight_format value) + { + weight_format = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::weight_sparsity get_weight_sparsity() const + { + return static_cast(weight_sparsity); + } + CONSTEXPR npu_set_weight_format_t& set_weight_sparsity(NPU_NAMESPACE::weight_sparsity value) + { + weight_sparsity = static_cast(value) & ((1U << 1)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("weight_format", (weight_format < (sizeof(weight_format_str)/sizeof(weight_format_str[0])) ? weight_format_str[weight_format] : "****"))); + fields.push_back(std::make_pair("weight_sparsity", (weight_sparsity < (sizeof(weight_sparsity_str)/sizeof(weight_sparsity_str[0])) ? 
weight_sparsity_str[weight_sparsity] : "****"))); + } +#endif +#endif +}; + +struct npu_set_blockdep_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t blockdep:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_blockdep_t(uint32_t _blockdep) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + blockdep(_blockdep & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_blockdep_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + blockdep(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_BLOCKDEP); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(blockdep) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_blockdep_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_blockdep_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_blockdep() const + { + return static_cast(blockdep); + } + CONSTEXPR npu_set_blockdep_t& set_blockdep(uint32_t value) + { + assert((value >> 3) == 0); + blockdep = static_cast(value & ((1U << 3)-1)); + return *this; + } 
+#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("blockdep", std::to_string(blockdep))); + } +#endif +#endif +}; + +struct npu_set_resize_x_scale_n_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t resize_x_scale_n_m1:11; + uint32_t reserved1:5; +#ifdef __cplusplus +public: + npu_set_resize_x_scale_n_m1_t(uint32_t _resize_x_scale_n_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_SCALE_N_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_x_scale_n_m1(_resize_x_scale_n_m1 & ((1U << 11)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_resize_x_scale_n_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_SCALE_N_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_x_scale_n_m1(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_SCALE_N_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_SCALE_N_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(resize_x_scale_n_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_resize_x_scale_n_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_resize_x_scale_n_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + 
return *this; + } + CONSTEXPR uint32_t get_resize_x_scale_n_m1() const + { + return static_cast(resize_x_scale_n_m1); + } + CONSTEXPR npu_set_resize_x_scale_n_m1_t& set_resize_x_scale_n_m1(uint32_t value) + { + assert((value >> 11) == 0); + resize_x_scale_n_m1 = static_cast(value & ((1U << 11)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("resize_x_scale_n_m1", std::to_string(resize_x_scale_n_m1))); + } +#endif +#endif +}; + +struct npu_set_resize_y_scale_n_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t resize_y_scale_n_m1:11; + uint32_t reserved1:5; +#ifdef __cplusplus +public: + npu_set_resize_y_scale_n_m1_t(uint32_t _resize_y_scale_n_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_SCALE_N_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_y_scale_n_m1(_resize_y_scale_n_m1 & ((1U << 11)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_resize_y_scale_n_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_SCALE_N_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_y_scale_n_m1(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_SCALE_N_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_SCALE_N_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(resize_y_scale_n_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_resize_y_scale_n_m1_t& 
set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_resize_y_scale_n_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_resize_y_scale_n_m1() const + { + return static_cast(resize_y_scale_n_m1); + } + CONSTEXPR npu_set_resize_y_scale_n_m1_t& set_resize_y_scale_n_m1(uint32_t value) + { + assert((value >> 11) == 0); + resize_y_scale_n_m1 = static_cast(value & ((1U << 11)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("resize_y_scale_n_m1", std::to_string(resize_y_scale_n_m1))); + } +#endif +#endif +}; + +struct npu_set_resize_x_offset_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t resize_x_offset:12; + uint32_t reserved1:4; +#ifdef __cplusplus +public: + npu_set_resize_x_offset_t(uint32_t _resize_x_offset) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_OFFSET)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_x_offset(_resize_x_offset & ((1U << 12)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_resize_x_offset_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_OFFSET)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_x_offset(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_OFFSET) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_X_OFFSET); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + 
uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(resize_x_offset) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_resize_x_offset_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_resize_x_offset_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_resize_x_offset() const + { + return static_cast(resize_x_offset); + } + CONSTEXPR npu_set_resize_x_offset_t& set_resize_x_offset(uint32_t value) + { + assert((value >> 12) == 0); + resize_x_offset = static_cast(value & ((1U << 12)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("resize_x_offset", std::to_string(((resize_x_offset <= std::numeric_limits::max() ? 
static_cast(resize_x_offset) : resize_x_offset - std::numeric_limits::min() + std::numeric_limits::max()) << 20) >> 20))); + } +#endif +#endif +}; + +struct npu_set_resize_y_offset_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t resize_y_offset:12; + uint32_t reserved1:4; +#ifdef __cplusplus +public: + npu_set_resize_y_offset_t(uint32_t _resize_y_offset) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_OFFSET)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_y_offset(_resize_y_offset & ((1U << 12)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_resize_y_offset_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_OFFSET)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + resize_y_offset(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_OFFSET) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_RESIZE_Y_OFFSET); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(resize_y_offset) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_resize_y_offset_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_resize_y_offset_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_resize_y_offset() const + { + return 
static_cast(resize_y_offset); + } + CONSTEXPR npu_set_resize_y_offset_t& set_resize_y_offset(uint32_t value) + { + assert((value >> 12) == 0); + resize_y_offset = static_cast(value & ((1U << 12)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("resize_y_offset", std::to_string(((resize_y_offset <= std::numeric_limits::max() ? static_cast(resize_y_offset) : resize_y_offset - std::numeric_limits::min() + std::numeric_limits::max()) << 20) >> 20))); + } +#endif +#endif +}; + +struct npu_set_dma0_src_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:5; + uint32_t region_mode:1; + uint32_t stride_mode:2; + uint32_t idx_mode:1; + uint32_t reserved2:4; +#ifdef __cplusplus +public: + npu_set_dma0_src_region_t(uint32_t _region, NPU_NAMESPACE::dma_region_mode _region_mode, NPU_NAMESPACE::dma_stride_mode _stride_mode, NPU_NAMESPACE::dma_idx_mode _idx_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0), + region_mode(static_cast(_region_mode) & ((1U << 1)-1)), + stride_mode(static_cast(_stride_mode) & ((1U << 2)-1)), + idx_mode(static_cast(_idx_mode) & ((1U << 1)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_dma0_src_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0), + region_mode(0), + stride_mode(0), + idx_mode(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = 
static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SRC_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + word |= uint32_t(region_mode) << 24; + word |= uint32_t(stride_mode) << 25; + word |= uint32_t(idx_mode) << 27; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_src_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_src_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_dma0_src_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_region_mode get_region_mode() const + { + return static_cast(region_mode); + } + CONSTEXPR npu_set_dma0_src_region_t& set_region_mode(NPU_NAMESPACE::dma_region_mode value) + { + region_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_stride_mode get_stride_mode() const + { + return static_cast(stride_mode); + } + CONSTEXPR npu_set_dma0_src_region_t& set_stride_mode(NPU_NAMESPACE::dma_stride_mode value) + { + stride_mode = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_idx_mode get_idx_mode() const + { + return static_cast(idx_mode); + } + CONSTEXPR npu_set_dma0_src_region_t& set_idx_mode(NPU_NAMESPACE::dma_idx_mode value) + { + idx_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE 
+ void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + fields.push_back(std::make_pair("region_mode", (region_mode < (sizeof(dma_region_mode_str)/sizeof(dma_region_mode_str[0])) ? dma_region_mode_str[region_mode] : "****"))); + fields.push_back(std::make_pair("stride_mode", (stride_mode < (sizeof(dma_stride_mode_str)/sizeof(dma_stride_mode_str[0])) ? dma_stride_mode_str[stride_mode] : "****"))); + fields.push_back(std::make_pair("idx_mode", (idx_mode < (sizeof(dma_idx_mode_str)/sizeof(dma_idx_mode_str[0])) ? dma_idx_mode_str[idx_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_dma0_dst_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:5; + uint32_t region_mode:1; + uint32_t reserved2:2; + uint32_t idx_mode:1; + uint32_t reserved3:4; +#ifdef __cplusplus +public: + npu_set_dma0_dst_region_t(uint32_t _region, NPU_NAMESPACE::dma_region_mode _region_mode, NPU_NAMESPACE::dma_idx_mode _idx_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0), + region_mode(static_cast(_region_mode) & ((1U << 1)-1)), + reserved2(0), + idx_mode(static_cast(_idx_mode) & ((1U << 1)-1)), + reserved3(0) + {} + CONSTEXPR npu_set_dma0_dst_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0), + region_mode(0), + reserved2(0), + idx_mode(0), + reserved3(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = 
static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_DST_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + word |= uint32_t(region_mode) << 24; + word |= uint32_t(idx_mode) << 27; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_region_mode get_region_mode() const + { + return static_cast(region_mode); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_region_mode(NPU_NAMESPACE::dma_region_mode value) + { + region_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::dma_idx_mode get_idx_mode() const + { + return static_cast(idx_mode); + } + CONSTEXPR npu_set_dma0_dst_region_t& set_idx_mode(NPU_NAMESPACE::dma_idx_mode value) + { + idx_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + fields.push_back(std::make_pair("region_mode", (region_mode < (sizeof(dma_region_mode_str)/sizeof(dma_region_mode_str[0])) ? 
dma_region_mode_str[region_mode] : "****"))); + fields.push_back(std::make_pair("idx_mode", (idx_mode < (sizeof(dma_idx_mode_str)/sizeof(dma_idx_mode_str[0])) ? dma_idx_mode_str[idx_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_dma0_size0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t size:16; +#ifdef __cplusplus +public: + npu_set_dma0_size0_t(uint32_t _size) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + size(_size & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_dma0_size0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + size(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(size) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_size0_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_size0_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_size() const + { + return static_cast(size); + } + CONSTEXPR npu_set_dma0_size0_t& set_size(uint32_t value) + { + assert((value >> 16) 
== 0); + size = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("size", std::to_string(size))); + } +#endif +#endif +}; + +struct npu_set_dma0_size1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t size:16; +#ifdef __cplusplus +public: + npu_set_dma0_size1_t(uint32_t _size) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + size(_size & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_dma0_size1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + size(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_SIZE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(size) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_size1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_size1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_size() const + { + return static_cast(size); + } + CONSTEXPR npu_set_dma0_size1_t& set_size(uint32_t value) + { + 
assert((value >> 16) == 0); + size = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("size", std::to_string(size))); + } +#endif +#endif +}; + +struct npu_set_dma0_idx_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_dma0_idx_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_IDX_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_dma0_idx_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_IDX_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_IDX_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_DMA0_IDX_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_idx_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_idx_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR 
uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_dma0_idx_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_ifm2_broadcast_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t broadcast_mode:4; + uint32_t reserved1:12; +#ifdef __cplusplus +public: + npu_set_ifm2_broadcast_t(NPU_NAMESPACE::broadcast_mode _broadcast_mode) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + broadcast_mode(static_cast(_broadcast_mode) & ((1U << 4)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm2_broadcast_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + broadcast_mode(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_BROADCAST); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(broadcast_mode) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_broadcast_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl 
get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_broadcast_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::broadcast_mode get_broadcast_mode() const + { + return static_cast(broadcast_mode); + } + CONSTEXPR npu_set_ifm2_broadcast_t& set_broadcast_mode(NPU_NAMESPACE::broadcast_mode value) + { + broadcast_mode = static_cast(value) & ((1U << 4)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("broadcast_mode", (broadcast_mode < (sizeof(broadcast_mode_str)/sizeof(broadcast_mode_str[0])) ? broadcast_mode_str[broadcast_mode] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ifm2_precision_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t activation_type:1; + uint32_t reserved1:1; + uint32_t activation_precision:2; + uint32_t reserved2:2; + uint32_t activation_format:2; + uint32_t reserved3:6; + uint32_t activation_storage:2; +#ifdef __cplusplus +public: + npu_set_ifm2_precision_t(NPU_NAMESPACE::activation_type _activation_type, NPU_NAMESPACE::activation_precision _activation_precision, NPU_NAMESPACE::activation_format _activation_format, NPU_NAMESPACE::activation_storage _activation_storage) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(static_cast(_activation_type) & ((1U << 1)-1)), + reserved1(0), + activation_precision(static_cast(_activation_precision) & ((1U << 2)-1)), + reserved2(0), + activation_format(static_cast(_activation_format) & ((1U << 2)-1)), + reserved3(0), + activation_storage(static_cast(_activation_storage) & ((1U << 2)-1)) + {} + CONSTEXPR npu_set_ifm2_precision_t() : + 
opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + activation_type(0), + reserved1(0), + activation_precision(0), + reserved2(0), + activation_format(0), + reserved3(0), + activation_storage(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_PRECISION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(activation_type) << 16; + word |= uint32_t(activation_precision) << 18; + word |= uint32_t(activation_format) << 22; + word |= uint32_t(activation_storage) << 30; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_precision_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_precision_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_type get_activation_type() const + { + return static_cast(activation_type); + } + CONSTEXPR npu_set_ifm2_precision_t& set_activation_type(NPU_NAMESPACE::activation_type value) + { + activation_type = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_precision get_activation_precision() const + { + return static_cast(activation_precision); + } + CONSTEXPR npu_set_ifm2_precision_t& set_activation_precision(NPU_NAMESPACE::activation_precision value) 
+ { + activation_precision = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_format get_activation_format() const + { + return static_cast(activation_format); + } + CONSTEXPR npu_set_ifm2_precision_t& set_activation_format(NPU_NAMESPACE::activation_format value) + { + activation_format = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::activation_storage get_activation_storage() const + { + return static_cast(activation_storage); + } + CONSTEXPR npu_set_ifm2_precision_t& set_activation_storage(NPU_NAMESPACE::activation_storage value) + { + activation_storage = static_cast(value) & ((1U << 2)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("activation_type", (activation_type < (sizeof(activation_type_str)/sizeof(activation_type_str[0])) ? activation_type_str[activation_type] : "****"))); + fields.push_back(std::make_pair("activation_precision", (activation_precision < (sizeof(activation_precision_str)/sizeof(activation_precision_str[0])) ? activation_precision_str[activation_precision] : "****"))); + fields.push_back(std::make_pair("activation_format", (activation_format < (sizeof(activation_format_str)/sizeof(activation_format_str[0])) ? activation_format_str[activation_format] : "****"))); + fields.push_back(std::make_pair("activation_storage", (activation_storage < (sizeof(activation_storage_str)/sizeof(activation_storage_str[0])) ? 
activation_storage_str[activation_storage] : "****"))); + } +#endif +#endif +}; + +struct npu_set_ifm2_zero_point_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t zero_point:16; +#ifdef __cplusplus +public: + npu_set_ifm2_zero_point_t(uint32_t _zero_point) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(_zero_point & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm2_zero_point_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + zero_point(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_ZERO_POINT); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(zero_point) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_zero_point_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_zero_point_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_zero_point() const + { + return static_cast(zero_point); + } + CONSTEXPR npu_set_ifm2_zero_point_t& set_zero_point(uint32_t value) + { + assert((value >> 16) == 0); + zero_point = static_cast(value & 
((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("zero_point", std::to_string(zero_point))); + } +#endif +#endif +}; + +struct npu_set_ifm2_width0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t width_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm2_width0_m1_t(uint32_t _width_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(_width_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm2_width0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + width_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_WIDTH0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(width_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_width0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_width0_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_width_m1() const + { + return static_cast(width_m1); + } + CONSTEXPR 
npu_set_ifm2_width0_m1_t& set_width_m1(uint32_t value) + { + assert((value >> 16) == 0); + width_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("width_m1", std::to_string(width_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm2_height0_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm2_height0_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm2_height0_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT0_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_height0_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_height0_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); 
+ return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ifm2_height0_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm2_height1_m1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t height_m1:16; +#ifdef __cplusplus +public: + npu_set_ifm2_height1_m1_t(uint32_t _height_m1) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(_height_m1 & ((1U << 16)-1)) + {} + CONSTEXPR npu_set_ifm2_height1_m1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + height_m1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_HEIGHT1_M1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(height_m1) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_height1_m1_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR 
npu_set_ifm2_height1_m1_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_height_m1() const + { + return static_cast(height_m1); + } + CONSTEXPR npu_set_ifm2_height1_m1_t& set_height_m1(uint32_t value) + { + assert((value >> 16) == 0); + height_m1 = static_cast(value & ((1U << 16)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("height_m1", std::to_string(height_m1))); + } +#endif +#endif +}; + +struct npu_set_ifm2_region_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t region:3; + uint32_t reserved1:13; +#ifdef __cplusplus +public: + npu_set_ifm2_region_t(uint32_t _region) : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(_region & ((1U << 3)-1)), + reserved1(0) + {} + CONSTEXPR npu_set_ifm2_region_t() : + opcode(static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL)), + region(0), + reserved1(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION) && control == static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd0_opcode::NPU_SET_IFM2_REGION); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD0_CTRL); + } + operator uint32_t() + { + uint32_t word = 0; + word |= uint32_t(opcode) << 0; + word |= uint32_t(control) << 14; + word |= uint32_t(region) << 16; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd0_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_region_t& set_opcode(NPU_NAMESPACE::cmd0_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); 
+ return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_region_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_region() const + { + return static_cast(region); + } + CONSTEXPR npu_set_ifm2_region_t& set_region(uint32_t value) + { + assert((value >> 3) == 0); + region = static_cast(value & ((1U << 3)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("region", std::to_string(region))); + } +#endif +#endif +}; + +struct npu_set_ifm_base0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_base0_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_base0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; 
+ return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_base0_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_base1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_base1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_base1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) 
| addr_lo; + } + CONSTEXPR npu_set_ifm_base1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_base2_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_base2_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_base2_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE2); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_base2_t& set_addr(uint64_t value) + { + addr_lo = 
static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_base3_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_base3_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_base3_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_BASE3); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_base3_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) 
& static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_stride_x_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_stride_x_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_stride_x_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_X); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_stride_x_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef 
NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_stride_y_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_stride_y_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_stride_y_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_Y); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_stride_y_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + 
std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm_stride_c_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm_stride_c_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm_stride_c_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_STRIDE_C); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm_stride_c_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + 
fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_base0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_base0_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_base0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_base0_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct 
npu_set_ofm_base1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_base1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_base1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_base1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_base2_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t 
reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_base2_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_base2_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE2); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_base2_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_base3_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t 
addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_base3_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_base3_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_BASE3); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_base3_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_stride_x_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_stride_x_t(uint64_t _addr) : + 
opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_stride_x_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_X); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_stride_x_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_stride_y_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_stride_y_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y)), + 
reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_stride_y_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_Y); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_stride_y_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ofm_stride_c_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ofm_stride_c_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + 
addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ofm_stride_c_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_STRIDE_C); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ofm_stride_c_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight_base_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_weight_base_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), 
+ reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_weight_base_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_BASE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_weight_base_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight_length_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t length:32; +#ifdef __cplusplus +public: + npu_set_weight_length_t(uint32_t _length) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(_length) + {} + CONSTEXPR npu_set_weight_length_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT_LENGTH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(length) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_weight_length_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_weight_length_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_length() const + { + return static_cast(length); + } + CONSTEXPR npu_set_weight_length_t& set_length(uint32_t value) + { + length = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("length", std::to_string(length))); + } +#endif +#endif +}; + +struct npu_set_scale_base_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_scale_base_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & 
static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_scale_base_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_BASE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_scale_base_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_scale_length_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t length:20; + uint32_t reserved2:12; +#ifdef __cplusplus +public: + npu_set_scale_length_t(uint32_t _length) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(_length & ((1U << 20)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_scale_length_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_SCALE_LENGTH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(length) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_scale_length_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_scale_length_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_length() const + { + return static_cast(length); + } + CONSTEXPR npu_set_scale_length_t& set_length(uint32_t value) + { + assert((value >> 20) == 0); + length = value & ((1U << 20)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("length", std::to_string(length))); + } +#endif +#endif +}; + +struct npu_set_ofm_scale_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t shift:6; + uint32_t dbl_rnd:5; + uint32_t reserved1:2; + uint32_t round_mode:3; + uint32_t scale:31; + uint32_t reserved2:1; +#ifdef __cplusplus +public: + npu_set_ofm_scale_t(uint32_t _shift, uint32_t _dbl_rnd, NPU_NAMESPACE::round_mode_ofm _round_mode, uint32_t _scale) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(_shift & ((1U << 6)-1)), + dbl_rnd(_dbl_rnd & ((1U << 5)-1)), + reserved1(0), + round_mode(static_cast(_round_mode) & ((1U << 3)-1)), + scale(_scale & ((1U << 31)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_ofm_scale_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(0), + dbl_rnd(0), + reserved1(0), + round_mode(0), + scale(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OFM_SCALE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(shift) << 16; + word |= uint64_t(dbl_rnd) << 22; + word |= uint64_t(round_mode) << 29; + word |= uint64_t(scale) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ofm_scale_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ofm_scale_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_shift() const + { + return static_cast(shift); + } + CONSTEXPR npu_set_ofm_scale_t& set_shift(uint32_t value) + { + assert((value >> 6) == 0); + shift = static_cast(value & ((1U << 6)-1)); + return *this; + } + CONSTEXPR uint32_t get_dbl_rnd() const + { + return static_cast(dbl_rnd); + } + CONSTEXPR npu_set_ofm_scale_t& set_dbl_rnd(uint32_t value) + { + assert((value >> 
5) == 0); + dbl_rnd = static_cast(value & ((1U << 5)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::round_mode_ofm get_round_mode() const + { + return static_cast(round_mode); + } + CONSTEXPR npu_set_ofm_scale_t& set_round_mode(NPU_NAMESPACE::round_mode_ofm value) + { + round_mode = static_cast(value) & ((1U << 3)-1); + return *this; + } + CONSTEXPR uint32_t get_scale() const + { + return static_cast(scale); + } + CONSTEXPR npu_set_ofm_scale_t& set_scale(uint32_t value) + { + assert((value >> 31) == 0); + scale = value & ((1U << 31)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("shift", std::to_string(shift))); + fields.push_back(std::make_pair("dbl_rnd", std::to_string(dbl_rnd))); + fields.push_back(std::make_pair("round_mode", (round_mode < (sizeof(round_mode_ofm_str)/sizeof(round_mode_ofm_str[0])) ? round_mode_ofm_str[round_mode] : "****"))); + fields.push_back(std::make_pair("scale", std::to_string(scale))); + } +#endif +#endif +}; + +struct npu_set_ifm_scale_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t shift:6; + uint32_t dbl_rnd:5; + uint32_t reserved1:2; + uint32_t round_mode:1; + uint32_t reserved2:2; + uint32_t scale:31; + uint32_t reserved3:1; +#ifdef __cplusplus +public: + npu_set_ifm_scale_t(uint32_t _shift, uint32_t _dbl_rnd, NPU_NAMESPACE::round_mode_ifm _round_mode, uint32_t _scale) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(_shift & ((1U << 6)-1)), + dbl_rnd(_dbl_rnd & ((1U << 5)-1)), + reserved1(0), + round_mode(static_cast(_round_mode) & ((1U << 1)-1)), + reserved2(0), + scale(_scale & ((1U << 31)-1)), + reserved3(0) + {} + CONSTEXPR npu_set_ifm_scale_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_SCALE)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(0), + dbl_rnd(0), + reserved1(0), + round_mode(0), + reserved2(0), + scale(0), + reserved3(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_SCALE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM_SCALE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(shift) << 16; + word |= uint64_t(dbl_rnd) << 22; + word |= uint64_t(round_mode) << 29; + word |= uint64_t(scale) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm_scale_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm_scale_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_shift() const + { + return static_cast(shift); + } + CONSTEXPR npu_set_ifm_scale_t& set_shift(uint32_t value) + { + assert((value >> 6) == 0); + shift = static_cast(value & ((1U << 6)-1)); + return *this; + } + CONSTEXPR uint32_t get_dbl_rnd() const + { + return static_cast(dbl_rnd); + } + CONSTEXPR npu_set_ifm_scale_t& set_dbl_rnd(uint32_t value) + { + assert((value >> 5) == 0); + dbl_rnd = static_cast(value & ((1U << 5)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::round_mode_ifm get_round_mode() const + { + return static_cast(round_mode); + } + CONSTEXPR npu_set_ifm_scale_t& set_round_mode(NPU_NAMESPACE::round_mode_ifm value) + { + round_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR 
uint32_t get_scale() const + { + return static_cast(scale); + } + CONSTEXPR npu_set_ifm_scale_t& set_scale(uint32_t value) + { + assert((value >> 31) == 0); + scale = value & ((1U << 31)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("shift", std::to_string(shift))); + fields.push_back(std::make_pair("dbl_rnd", std::to_string(dbl_rnd))); + fields.push_back(std::make_pair("round_mode", (round_mode < (sizeof(round_mode_ifm_str)/sizeof(round_mode_ifm_str[0])) ? round_mode_ifm_str[round_mode] : "****"))); + fields.push_back(std::make_pair("scale", std::to_string(scale))); + } +#endif +#endif +}; + +struct npu_set_ifm2_scale_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t shift:6; + uint32_t dbl_rnd:5; + uint32_t reserved1:2; + uint32_t round_mode:1; + uint32_t reserved2:2; + uint32_t scale:31; + uint32_t reserved3:1; +#ifdef __cplusplus +public: + npu_set_ifm2_scale_t(uint32_t _shift, uint32_t _dbl_rnd, NPU_NAMESPACE::round_mode_ifm _round_mode, uint32_t _scale) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(_shift & ((1U << 6)-1)), + dbl_rnd(_dbl_rnd & ((1U << 5)-1)), + reserved1(0), + round_mode(static_cast(_round_mode) & ((1U << 1)-1)), + reserved2(0), + scale(_scale & ((1U << 31)-1)), + reserved3(0) + {} + CONSTEXPR npu_set_ifm2_scale_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_SCALE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + shift(0), + dbl_rnd(0), + reserved1(0), + round_mode(0), + reserved2(0), + scale(0), + reserved3(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_SCALE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = 
static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_SCALE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(shift) << 16; + word |= uint64_t(dbl_rnd) << 22; + word |= uint64_t(round_mode) << 29; + word |= uint64_t(scale) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_ifm2_scale_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_ifm2_scale_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_shift() const + { + return static_cast(shift); + } + CONSTEXPR npu_set_ifm2_scale_t& set_shift(uint32_t value) + { + assert((value >> 6) == 0); + shift = static_cast(value & ((1U << 6)-1)); + return *this; + } + CONSTEXPR uint32_t get_dbl_rnd() const + { + return static_cast(dbl_rnd); + } + CONSTEXPR npu_set_ifm2_scale_t& set_dbl_rnd(uint32_t value) + { + assert((value >> 5) == 0); + dbl_rnd = static_cast(value & ((1U << 5)-1)); + return *this; + } + CONSTEXPR NPU_NAMESPACE::round_mode_ifm get_round_mode() const + { + return static_cast(round_mode); + } + CONSTEXPR npu_set_ifm2_scale_t& set_round_mode(NPU_NAMESPACE::round_mode_ifm value) + { + round_mode = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR uint32_t get_scale() const + { + return static_cast(scale); + } + CONSTEXPR npu_set_ifm2_scale_t& set_scale(uint32_t value) + { + assert((value >> 31) == 0); + scale = value & ((1U << 31)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("shift", 
std::to_string(shift))); + fields.push_back(std::make_pair("dbl_rnd", std::to_string(dbl_rnd))); + fields.push_back(std::make_pair("round_mode", (round_mode < (sizeof(round_mode_ifm_str)/sizeof(round_mode_ifm_str[0])) ? round_mode_ifm_str[round_mode] : "****"))); + fields.push_back(std::make_pair("scale", std::to_string(scale))); + } +#endif +#endif +}; + +struct npu_set_op_scalar_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t scalar:32; +#ifdef __cplusplus +public: + npu_set_op_scalar_t(uint32_t _scalar) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OP_SCALAR)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + scalar(_scalar) + {} + CONSTEXPR npu_set_op_scalar_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OP_SCALAR)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + scalar(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OP_SCALAR) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_OP_SCALAR); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(scalar) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_op_scalar_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_op_scalar_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR 
uint32_t get_scalar() const + { + return static_cast(scalar); + } + CONSTEXPR npu_set_op_scalar_t& set_scalar(uint32_t value) + { + scalar = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("scalar", std::to_string(((scalar <= std::numeric_limits::max() ? static_cast(scalar) : scalar - std::numeric_limits::min() + std::numeric_limits::max()) << 0) >> 0))); + } +#endif +#endif +}; + +struct npu_set_dma0_src_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_src_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_src_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_src_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & 
static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_dst_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_dst_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_dst_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_dst_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & 
static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_len_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_len_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_len_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_LEN); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_len_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void 
disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_src_stride0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_src_stride0_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_src_stride0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE0) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_src_stride0_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { 
+ std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_src_stride1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_src_stride1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_src_stride1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_SRC_STRIDE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_src_stride1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << 
std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_dst_stride0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_dst_stride0_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_dst_stride0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE0) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_dst_stride0_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + 
fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_dst_stride1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_dst_stride1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_dst_stride1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_DST_STRIDE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_dst_stride1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", 
saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_idx_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_idx_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_idx_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_idx_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_dma0_idx_max_t +{ +#ifdef __cplusplus +private: 
+#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t idx_max:31; + uint32_t reserved2:1; +#ifdef __cplusplus +public: + npu_set_dma0_idx_max_t(uint32_t _idx_max) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_MAX)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + idx_max(_idx_max & ((1U << 31)-1)), + reserved2(0) + {} + CONSTEXPR npu_set_dma0_idx_max_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_MAX)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + idx_max(0), + reserved2(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_MAX) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_MAX); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(idx_max) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_dma0_idx_max_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_dma0_idx_max_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_idx_max() const + { + return static_cast(idx_max); + } + CONSTEXPR npu_set_dma0_idx_max_t& set_idx_max(uint32_t value) + { + assert((value >> 31) == 0); + idx_max = value & ((1U << 31)-1); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + 
fields.push_back(std::make_pair("idx_max", std::to_string(idx_max))); + } +#endif +#endif +}; + +struct npu_set_dma0_idx_skip1_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_dma0_idx_skip1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_SKIP1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_dma0_idx_skip1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_SKIP1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_SKIP1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_DMA0_IDX_SKIP1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_dma0_idx_skip1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", 
saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_base0_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_base0_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_base0_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE0); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_base0_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_base1_t +{ +#ifdef 
__cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_base1_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_base1_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE1); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_base1_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_base2_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t 
control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_base2_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_base2_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE2); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_base2_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_base3_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef 
__cplusplus +public: + npu_set_ifm2_base3_t(uint64_t _addr) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_base3_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_BASE3); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_base3_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_stride_x_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_stride_x_t(uint64_t _addr) : + 
opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_stride_x_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_X); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_stride_x_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_stride_y_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_stride_y_t(uint64_t _addr) : + 
opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_stride_y_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_Y); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_stride_y_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_ifm2_stride_c_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_ifm2_stride_c_t(uint64_t _addr) : + 
opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_ifm2_stride_c_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_IFM2_STRIDE_C); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_ifm2_stride_c_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight1_base_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_weight1_base_t(uint64_t _addr) : + 
opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_weight1_base_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_BASE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_weight1_base_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight1_length_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t length:32; +#ifdef __cplusplus +public: + npu_set_weight1_length_t(uint32_t _length) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(_length) + {} + CONSTEXPR npu_set_weight1_length_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT1_LENGTH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(length) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_weight1_length_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_weight1_length_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_length() const + { + return static_cast(length); + } + CONSTEXPR npu_set_weight1_length_t& set_length(uint32_t value) + { + length = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("length", std::to_string(length))); + } +#endif +#endif +}; + +struct npu_set_weight2_base_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_weight2_base_t(uint64_t _addr) : + 
opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_weight2_base_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_BASE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_BASE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_weight2_base_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight2_length_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t length:32; +#ifdef __cplusplus +public: + npu_set_weight2_length_t(uint32_t _length) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_LENGTH)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(_length) + {} + CONSTEXPR npu_set_weight2_length_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_LENGTH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT2_LENGTH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(length) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_weight2_length_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_weight2_length_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_length() const + { + return static_cast(length); + } + CONSTEXPR npu_set_weight2_length_t& set_length(uint32_t value) + { + length = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("length", std::to_string(length))); + } +#endif +#endif +}; + +struct npu_set_weight3_base_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t addr_hi:8; + uint32_t reserved1:8; + uint32_t addr_lo:32; +#ifdef __cplusplus +public: + npu_set_weight3_base_t(uint64_t _addr) : + 
opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(static_cast((_addr >> 32) & static_cast(std::numeric_limits::max()))), + reserved1(0), + addr_lo(static_cast((_addr) & static_cast(std::numeric_limits::max()))) + {} + CONSTEXPR npu_set_weight3_base_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_BASE)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + addr_hi(0), + reserved1(0), + addr_lo(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_BASE) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_BASE); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(addr_hi) << 16; + word |= uint64_t(addr_lo) << 32; + return word; + } + CONSTEXPR uint64_t get_addr() const + { + return (static_cast(addr_hi) << 32) | addr_lo; + } + CONSTEXPR npu_set_weight3_base_t& set_addr(uint64_t value) + { + addr_lo = static_cast((value) & static_cast(std::numeric_limits::max())); addr_hi = static_cast((value >> 32) & static_cast(std::numeric_limits::max())); return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + std::stringstream saddr; saddr << std::hex << "0x" << get_addr(); + fields.push_back(std::make_pair("addr", saddr.str())); + } +#endif +#endif +}; + +struct npu_set_weight3_length_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t reserved1:16; + uint32_t length:32; +#ifdef __cplusplus +public: + npu_set_weight3_length_t(uint32_t _length) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_LENGTH)), + reserved0(0), + 
control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(_length) + {} + CONSTEXPR npu_set_weight3_length_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_LENGTH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + reserved1(0), + length(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_LENGTH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_WEIGHT3_LENGTH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(length) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_weight3_length_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_weight3_length_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_length() const + { + return static_cast(length); + } + CONSTEXPR npu_set_weight3_length_t& set_length(uint32_t value) + { + length = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("length", std::to_string(length))); + } +#endif +#endif +}; + +struct npu_set_resize_x_step_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t one_step_int:4; + uint32_t blk_step_int:11; + uint32_t reserved1:1; + uint32_t one_step_mod:11; + uint32_t reserved2:5; + uint32_t blk_step_mod:11; + uint32_t reserved3:5; +#ifdef 
__cplusplus +public: + npu_set_resize_x_step_t(uint32_t _one_step_int, uint32_t _blk_step_int, uint32_t _one_step_mod, uint32_t _blk_step_mod) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + one_step_int(_one_step_int & ((1U << 4)-1)), + blk_step_int(_blk_step_int & ((1U << 11)-1)), + reserved1(0), + one_step_mod(_one_step_mod & ((1U << 11)-1)), + reserved2(0), + blk_step_mod(_blk_step_mod & ((1U << 11)-1)), + reserved3(0) + {} + CONSTEXPR npu_set_resize_x_step_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_X)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + one_step_int(0), + blk_step_int(0), + reserved1(0), + one_step_mod(0), + reserved2(0), + blk_step_mod(0), + reserved3(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_X) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_X); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(one_step_int) << 16; + word |= uint64_t(blk_step_int) << 20; + word |= uint64_t(one_step_mod) << 32; + word |= uint64_t(blk_step_mod) << 48; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_resize_x_step_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_resize_x_step_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_one_step_int() const + { + 
return static_cast(one_step_int); + } + CONSTEXPR npu_set_resize_x_step_t& set_one_step_int(uint32_t value) + { + assert((value >> 4) == 0); + one_step_int = static_cast(value & ((1U << 4)-1)); + return *this; + } + CONSTEXPR uint32_t get_blk_step_int() const + { + return static_cast(blk_step_int); + } + CONSTEXPR npu_set_resize_x_step_t& set_blk_step_int(uint32_t value) + { + assert((value >> 11) == 0); + blk_step_int = static_cast(value & ((1U << 11)-1)); + return *this; + } + CONSTEXPR uint32_t get_one_step_mod() const + { + return static_cast(one_step_mod); + } + CONSTEXPR npu_set_resize_x_step_t& set_one_step_mod(uint32_t value) + { + assert((value >> 11) == 0); + one_step_mod = static_cast(value & ((1U << 11)-1)); + return *this; + } + CONSTEXPR uint32_t get_blk_step_mod() const + { + return static_cast(blk_step_mod); + } + CONSTEXPR npu_set_resize_x_step_t& set_blk_step_mod(uint32_t value) + { + assert((value >> 11) == 0); + blk_step_mod = static_cast(value & ((1U << 11)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("one_step_int", std::to_string(one_step_int))); + fields.push_back(std::make_pair("blk_step_int", std::to_string(blk_step_int))); + fields.push_back(std::make_pair("one_step_mod", std::to_string(one_step_mod))); + fields.push_back(std::make_pair("blk_step_mod", std::to_string(blk_step_mod))); + } +#endif +#endif +}; + +struct npu_set_resize_y_step_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t one_step_int:4; + uint32_t blk_step_int:11; + uint32_t reserved1:1; + uint32_t one_step_mod:11; + uint32_t reserved2:5; + uint32_t blk_step_mod:11; + uint32_t reserved3:5; +#ifdef __cplusplus +public: + npu_set_resize_y_step_t(uint32_t _one_step_int, uint32_t _blk_step_int, uint32_t _one_step_mod, uint32_t _blk_step_mod) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_Y)), + 
reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + one_step_int(_one_step_int & ((1U << 4)-1)), + blk_step_int(_blk_step_int & ((1U << 11)-1)), + reserved1(0), + one_step_mod(_one_step_mod & ((1U << 11)-1)), + reserved2(0), + blk_step_mod(_blk_step_mod & ((1U << 11)-1)), + reserved3(0) + {} + CONSTEXPR npu_set_resize_y_step_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_Y)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + one_step_int(0), + blk_step_int(0), + reserved1(0), + one_step_mod(0), + reserved2(0), + blk_step_mod(0), + reserved3(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_Y) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_SET_RESIZE_Y); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(one_step_int) << 16; + word |= uint64_t(blk_step_int) << 20; + word |= uint64_t(one_step_mod) << 32; + word |= uint64_t(blk_step_mod) << 48; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_set_resize_y_step_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_set_resize_y_step_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR uint32_t get_one_step_int() const + { + return static_cast(one_step_int); + } + CONSTEXPR npu_set_resize_y_step_t& set_one_step_int(uint32_t value) + { + assert((value >> 4) == 0); + one_step_int = static_cast(value & ((1U << 4)-1)); + return *this; + } + 
CONSTEXPR uint32_t get_blk_step_int() const + { + return static_cast(blk_step_int); + } + CONSTEXPR npu_set_resize_y_step_t& set_blk_step_int(uint32_t value) + { + assert((value >> 11) == 0); + blk_step_int = static_cast(value & ((1U << 11)-1)); + return *this; + } + CONSTEXPR uint32_t get_one_step_mod() const + { + return static_cast(one_step_mod); + } + CONSTEXPR npu_set_resize_y_step_t& set_one_step_mod(uint32_t value) + { + assert((value >> 11) == 0); + one_step_mod = static_cast(value & ((1U << 11)-1)); + return *this; + } + CONSTEXPR uint32_t get_blk_step_mod() const + { + return static_cast(blk_step_mod); + } + CONSTEXPR npu_set_resize_y_step_t& set_blk_step_mod(uint32_t value) + { + assert((value >> 11) == 0); + blk_step_mod = static_cast(value & ((1U << 11)-1)); + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("one_step_int", std::to_string(one_step_int))); + fields.push_back(std::make_pair("blk_step_int", std::to_string(blk_step_int))); + fields.push_back(std::make_pair("one_step_mod", std::to_string(one_step_mod))); + fields.push_back(std::make_pair("blk_step_mod", std::to_string(blk_step_mod))); + } +#endif +#endif +}; + +struct npu_op_branch_t +{ +#ifdef __cplusplus +private: +#endif + uint32_t opcode:10; + uint32_t reserved0:4; + uint32_t control:2; + uint32_t branch_cond:1; + uint32_t reserved1:15; + uint32_t branch_target:32; +#ifdef __cplusplus +public: + npu_op_branch_t(NPU_NAMESPACE::branch_cond _branch_cond, uint32_t _branch_target) : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_OP_BRANCH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + branch_cond(static_cast(_branch_cond) & ((1U << 1)-1)), + reserved1(0), + branch_target(_branch_target) + {} + CONSTEXPR npu_op_branch_t() : + opcode(static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_OP_BRANCH)), + reserved0(0), + control(static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL)), + 
branch_cond(0), + reserved1(0), + branch_target(0) + {} + CONSTEXPR bool valid() const + { + return opcode == static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_OP_BRANCH) && control >= 1 && control <= 2; + } + CONSTEXPR void init() + { + opcode = static_cast(NPU_NAMESPACE::cmd1_opcode::NPU_OP_BRANCH); control = static_cast(NPU_NAMESPACE::cmd_ctrl::CMD1_CTRL); + } + operator uint64_t() + { + uint64_t word = 0; + word |= uint64_t(opcode) << 0; + word |= uint64_t(control) << 14; + word |= uint64_t(branch_cond) << 16; + word |= uint64_t(branch_target) << 32; + return word; + } + CONSTEXPR NPU_NAMESPACE::cmd1_opcode get_opcode() const + { + return static_cast(opcode); + } + CONSTEXPR npu_op_branch_t& set_opcode(NPU_NAMESPACE::cmd1_opcode value) + { + opcode = static_cast(value) & ((1U << 10)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::cmd_ctrl get_control() const + { + return static_cast(control); + } + CONSTEXPR npu_op_branch_t& set_control(NPU_NAMESPACE::cmd_ctrl value) + { + control = static_cast(value) & ((1U << 2)-1); + return *this; + } + CONSTEXPR NPU_NAMESPACE::branch_cond get_branch_cond() const + { + return static_cast(branch_cond); + } + CONSTEXPR npu_op_branch_t& set_branch_cond(NPU_NAMESPACE::branch_cond value) + { + branch_cond = static_cast(value) & ((1U << 1)-1); + return *this; + } + CONSTEXPR uint32_t get_branch_target() const + { + return static_cast(branch_target); + } + CONSTEXPR npu_op_branch_t& set_branch_target(uint32_t value) + { + branch_target = value; + return *this; + } +#ifdef NPU_DISASSEMBLE + void disassemble(std::vector>& fields) const + { + fields.push_back(std::make_pair("branch_cond", (branch_cond < (sizeof(branch_cond_str)/sizeof(branch_cond_str[0])) ? 
branch_cond_str[branch_cond] : "****"))); + fields.push_back(std::make_pair("branch_target", std::to_string(branch_target))); + } +#endif +#endif +}; +#ifdef __cplusplus +}; +#endif +#define NPU_OP_STRUCTS \ + NPU_OP_(stop) \ + NPU_OP_(irq) \ + NPU_OP_(conv) \ + NPU_OP_(depthwise) \ + NPU_OP_(pool) \ + NPU_OP_(elementwise) \ + NPU_OP_(resize) \ + NPU_OP_(dma_start) \ + NPU_OP_(dma_wait) \ + NPU_OP_(kernel_wait) \ + NPU_OP_(pmu_mask) \ + NPU_OP_(branch) + +#define NPU_SET_STRUCTS \ + NPU_SET_(ifm_pad_top) \ + NPU_SET_(ifm_pad_left) \ + NPU_SET_(ifm_pad_right) \ + NPU_SET_(ifm_pad_bottom) \ + NPU_SET_(ifm_depth_m1) \ + NPU_SET_(ifm_precision) \ + NPU_SET_(ifm_upscale) \ + NPU_SET_(ifm_zero_point) \ + NPU_SET_(ifm_width0_m1) \ + NPU_SET_(ifm_height0_m1) \ + NPU_SET_(ifm_height1_m1) \ + NPU_SET_(ifm_region) \ + NPU_SET_(ifm_broadcast) \ + NPU_SET_(ofm_width_m1) \ + NPU_SET_(ofm_height_m1) \ + NPU_SET_(ofm_depth_m1) \ + NPU_SET_(ofm_precision) \ + NPU_SET_(ofm_blk_width_m1) \ + NPU_SET_(ofm_blk_height_m1) \ + NPU_SET_(ofm_blk_depth_m1) \ + NPU_SET_(ofm_zero_point) \ + NPU_SET_(ofm_width0_m1) \ + NPU_SET_(ofm_height0_m1) \ + NPU_SET_(ofm_height1_m1) \ + NPU_SET_(ofm_region) \ + NPU_SET_(kernel_width_m1) \ + NPU_SET_(kernel_height_m1) \ + NPU_SET_(kernel_stride) \ + NPU_SET_(acc_format) \ + NPU_SET_(activation) \ + NPU_SET_(activation_min) \ + NPU_SET_(activation_max) \ + NPU_SET_(weight_region) \ + NPU_SET_(scale_region) \ + NPU_SET_(weight_format) \ + NPU_SET_(blockdep) \ + NPU_SET_(resize_x_scale_n_m1) \ + NPU_SET_(resize_y_scale_n_m1) \ + NPU_SET_(resize_x_offset) \ + NPU_SET_(resize_y_offset) \ + NPU_SET_(dma0_src_region) \ + NPU_SET_(dma0_dst_region) \ + NPU_SET_(dma0_size0) \ + NPU_SET_(dma0_size1) \ + NPU_SET_(dma0_idx_region) \ + NPU_SET_(ifm2_broadcast) \ + NPU_SET_(ifm2_precision) \ + NPU_SET_(ifm2_zero_point) \ + NPU_SET_(ifm2_width0_m1) \ + NPU_SET_(ifm2_height0_m1) \ + NPU_SET_(ifm2_height1_m1) \ + NPU_SET_(ifm2_region) \ + NPU_SET_(ifm_base0) \ + 
NPU_SET_(ifm_base1) \ + NPU_SET_(ifm_base2) \ + NPU_SET_(ifm_base3) \ + NPU_SET_(ifm_stride_x) \ + NPU_SET_(ifm_stride_y) \ + NPU_SET_(ifm_stride_c) \ + NPU_SET_(ofm_base0) \ + NPU_SET_(ofm_base1) \ + NPU_SET_(ofm_base2) \ + NPU_SET_(ofm_base3) \ + NPU_SET_(ofm_stride_x) \ + NPU_SET_(ofm_stride_y) \ + NPU_SET_(ofm_stride_c) \ + NPU_SET_(weight_base) \ + NPU_SET_(weight_length) \ + NPU_SET_(scale_base) \ + NPU_SET_(scale_length) \ + NPU_SET_(ofm_scale) \ + NPU_SET_(ifm_scale) \ + NPU_SET_(ifm2_scale) \ + NPU_SET_(op_scalar) \ + NPU_SET_(dma0_src) \ + NPU_SET_(dma0_dst) \ + NPU_SET_(dma0_len) \ + NPU_SET_(dma0_src_stride0) \ + NPU_SET_(dma0_src_stride1) \ + NPU_SET_(dma0_dst_stride0) \ + NPU_SET_(dma0_dst_stride1) \ + NPU_SET_(dma0_idx) \ + NPU_SET_(dma0_idx_max) \ + NPU_SET_(dma0_idx_skip1) \ + NPU_SET_(ifm2_base0) \ + NPU_SET_(ifm2_base1) \ + NPU_SET_(ifm2_base2) \ + NPU_SET_(ifm2_base3) \ + NPU_SET_(ifm2_stride_x) \ + NPU_SET_(ifm2_stride_y) \ + NPU_SET_(ifm2_stride_c) \ + NPU_SET_(weight1_base) \ + NPU_SET_(weight1_length) \ + NPU_SET_(weight2_base) \ + NPU_SET_(weight2_length) \ + NPU_SET_(weight3_base) \ + NPU_SET_(weight3_length) \ + NPU_SET_(resize_x_step) \ + NPU_SET_(resize_y_step) + +#define EXPAND_ACC_FORMAT(FUNC, SEP) \ + FUNC(acc_format, I32) SEP \ + FUNC(acc_format, I48) + +#define EXPAND_ACC_INPUT(FUNC, SEP) \ + FUNC(acc_input, RESET) SEP \ + FUNC(acc_input, KEEP) SEP \ + FUNC(acc_input, IFM2) + +#define EXPAND_ACC_OUTPUT(FUNC, SEP) \ + FUNC(acc_output, ENABLE) SEP \ + FUNC(acc_output, DISABLE) + +#define EXPAND_ACTIVATION_CLIP_RANGE(FUNC, SEP) \ + FUNC(activation_clip_range, B16) SEP \ + FUNC(activation_clip_range, NONE) + +#define EXPAND_ACTIVATION_FORMAT(FUNC, SEP) \ + FUNC(activation_format, NHWC) SEP \ + FUNC(activation_format, NHCWB16) + +#define EXPAND_ACTIVATION_FUNCTION(FUNC, SEP) \ + FUNC(activation_function, LUT_NONE) SEP \ + FUNC(activation_function, LUT_U8_U8) SEP \ + FUNC(activation_function, LUT_S8_S8) SEP \ + FUNC(activation_function, 
LUT_S8_S16) SEP \ + FUNC(activation_function, LUT_S8_S32) SEP \ + FUNC(activation_function, LUT_S16_S16) SEP \ + FUNC(activation_function, LUT_S16_S32) SEP \ + FUNC(activation_function, LUT_TANH) SEP \ + FUNC(activation_function, LUT_SIGMOID) + +#define EXPAND_ACTIVATION_PRECISION(FUNC, SEP) \ + FUNC(activation_precision, B8) SEP \ + FUNC(activation_precision, B16) SEP \ + FUNC(activation_precision, B32) SEP \ + FUNC(activation_precision, B64) + +#define EXPAND_ACTIVATION_REVERSE(FUNC, SEP) \ + FUNC(activation_reverse, NONE) SEP \ + FUNC(activation_reverse, H) SEP \ + FUNC(activation_reverse, W) SEP \ + FUNC(activation_reverse, C) + +#define EXPAND_ACTIVATION_STORAGE(FUNC, SEP) \ + FUNC(activation_storage, TILE2X2) SEP \ + FUNC(activation_storage, TILE3X1) SEP \ + FUNC(activation_storage, CHAINED) SEP \ + FUNC(activation_storage, NONE) + +#define EXPAND_ACTIVATION_TRANSPOSE(FUNC, SEP) \ + FUNC(activation_transpose, HWC) SEP \ + FUNC(activation_transpose, WHC) SEP \ + FUNC(activation_transpose, HCW) SEP \ + FUNC(activation_transpose, WCH) SEP \ + FUNC(activation_transpose, CHW) SEP \ + FUNC(activation_transpose, CWH) + +#define EXPAND_ACTIVATION_TYPE(FUNC, SEP) \ + FUNC(activation_type, UNSIGNED) SEP \ + FUNC(activation_type, SIGNED) + +#define EXPAND_AXI_MEM_DOMAIN(FUNC, SEP) \ + FUNC(axi_mem_domain, NON_SHARABLE) SEP \ + FUNC(axi_mem_domain, INNER_SHARABLE) SEP \ + FUNC(axi_mem_domain, OUTER_SHARABLE) SEP \ + FUNC(axi_mem_domain, SYSTEM) + +#define EXPAND_AXI_MEM_ENCODING(FUNC, SEP) \ + FUNC(axi_mem_encoding, DEVICE_NON_BUFFERABLE) SEP \ + FUNC(axi_mem_encoding, DEVICE_BUFFERABLE) SEP \ + FUNC(axi_mem_encoding, NORMAL_NON_CACHEABLE_NON_BUFFERABLE) SEP \ + FUNC(axi_mem_encoding, NORMAL_NON_CACHEABLE_BUFFERABLE) SEP \ + FUNC(axi_mem_encoding, WRITE_THROUGH_NO_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_THROUGH_READ_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_THROUGH_WRITE_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_THROUGH_READ_AND_WRITE_ALLOCATE) SEP \ + 
FUNC(axi_mem_encoding, WRITE_BACK_NO_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_BACK_READ_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_BACK_WRITE_ALLOCATE) SEP \ + FUNC(axi_mem_encoding, WRITE_BACK_READ_AND_WRITE_ALLOCATE) + +#define EXPAND_AXI_PORT(FUNC, SEP) \ + FUNC(axi_port, SRAM) SEP \ + FUNC(axi_port, EXT) + +#define EXPAND_BRANCH_COND(FUNC, SEP) \ + FUNC(branch_cond, ALWAYS) SEP \ + FUNC(branch_cond, RF_TRUE) + +#define EXPAND_BROADCAST_MODE(FUNC, SEP) \ + FUNC(broadcast_mode, NONE) SEP \ + FUNC(broadcast_mode, H) SEP \ + FUNC(broadcast_mode, W) SEP \ + FUNC(broadcast_mode, HW) SEP \ + FUNC(broadcast_mode, C) SEP \ + FUNC(broadcast_mode, CH) SEP \ + FUNC(broadcast_mode, CW) SEP \ + FUNC(broadcast_mode, CWH) SEP \ + FUNC(broadcast_mode, SCALAR) + +#define EXPAND_CMD0_OPCODE(FUNC, SEP) \ + FUNC(cmd0_opcode, NPU_OP_STOP) SEP \ + FUNC(cmd0_opcode, NPU_OP_IRQ) SEP \ + FUNC(cmd0_opcode, NPU_OP_CONV) SEP \ + FUNC(cmd0_opcode, NPU_OP_DEPTHWISE) SEP \ + FUNC(cmd0_opcode, NPU_OP_POOL) SEP \ + FUNC(cmd0_opcode, NPU_OP_ELEMENTWISE) SEP \ + FUNC(cmd0_opcode, NPU_OP_RESIZE) SEP \ + FUNC(cmd0_opcode, NPU_OP_DMA_START) SEP \ + FUNC(cmd0_opcode, NPU_OP_DMA_WAIT) SEP \ + FUNC(cmd0_opcode, NPU_OP_KERNEL_WAIT) SEP \ + FUNC(cmd0_opcode, NPU_OP_PMU_MASK) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PAD_TOP) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PAD_LEFT) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PAD_RIGHT) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PAD_BOTTOM) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_DEPTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_PRECISION) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_UPSCALE) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_BROADCAST) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_ZERO_POINT) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_WIDTH0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_HEIGHT0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_HEIGHT1_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_WIDTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_HEIGHT_M1) SEP 
\ + FUNC(cmd0_opcode, NPU_SET_OFM_DEPTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_PRECISION) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_BLK_WIDTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_BLK_HEIGHT_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_BLK_DEPTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_ZERO_POINT) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_WIDTH0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_HEIGHT0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_HEIGHT1_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_OFM_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_KERNEL_WIDTH_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_KERNEL_HEIGHT_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_KERNEL_STRIDE) SEP \ + FUNC(cmd0_opcode, NPU_SET_ACC_FORMAT) SEP \ + FUNC(cmd0_opcode, NPU_SET_ACTIVATION) SEP \ + FUNC(cmd0_opcode, NPU_SET_ACTIVATION_MIN) SEP \ + FUNC(cmd0_opcode, NPU_SET_ACTIVATION_MAX) SEP \ + FUNC(cmd0_opcode, NPU_SET_WEIGHT_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_SCALE_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_RESIZE_X_SCALE_N_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_RESIZE_Y_SCALE_N_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_RESIZE_X_OFFSET) SEP \ + FUNC(cmd0_opcode, NPU_SET_RESIZE_Y_OFFSET) SEP \ + FUNC(cmd0_opcode, NPU_SET_WEIGHT_FORMAT) SEP \ + FUNC(cmd0_opcode, NPU_SET_BLOCKDEP) SEP \ + FUNC(cmd0_opcode, NPU_SET_DMA0_SRC_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_DMA0_DST_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_DMA0_SIZE0) SEP \ + FUNC(cmd0_opcode, NPU_SET_DMA0_SIZE1) SEP \ + FUNC(cmd0_opcode, NPU_SET_DMA0_IDX_REGION) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_BROADCAST) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_PRECISION) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_ZERO_POINT) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_WIDTH0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_HEIGHT0_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_HEIGHT1_M1) SEP \ + FUNC(cmd0_opcode, NPU_SET_IFM2_REGION) + +#define EXPAND_CMD1_OPCODE(FUNC, SEP) \ + FUNC(cmd1_opcode, NPU_SET_IFM_BASE0) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_BASE1) SEP \ + FUNC(cmd1_opcode, 
NPU_SET_IFM_BASE2) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_BASE3) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_STRIDE_X) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_STRIDE_Y) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_STRIDE_C) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_BASE0) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_BASE1) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_BASE2) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_BASE3) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_STRIDE_X) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_STRIDE_Y) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_STRIDE_C) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT_BASE) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT_LENGTH) SEP \ + FUNC(cmd1_opcode, NPU_SET_SCALE_BASE) SEP \ + FUNC(cmd1_opcode, NPU_SET_SCALE_LENGTH) SEP \ + FUNC(cmd1_opcode, NPU_SET_OFM_SCALE) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM_SCALE) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_SCALE) SEP \ + FUNC(cmd1_opcode, NPU_SET_OP_SCALAR) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_SRC) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_DST) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_LEN) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_SRC_STRIDE0) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_SRC_STRIDE1) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_DST_STRIDE0) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_DST_STRIDE1) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_IDX) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_IDX_MAX) SEP \ + FUNC(cmd1_opcode, NPU_SET_DMA0_IDX_SKIP1) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_BASE0) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_BASE1) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_BASE2) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_BASE3) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_STRIDE_X) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_STRIDE_Y) SEP \ + FUNC(cmd1_opcode, NPU_SET_IFM2_STRIDE_C) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT1_BASE) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT1_LENGTH) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT2_BASE) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT2_LENGTH) SEP \ + FUNC(cmd1_opcode, NPU_SET_WEIGHT3_BASE) SEP \ + FUNC(cmd1_opcode, 
NPU_SET_WEIGHT3_LENGTH) SEP \ + FUNC(cmd1_opcode, NPU_SET_RESIZE_X) SEP \ + FUNC(cmd1_opcode, NPU_SET_RESIZE_Y) SEP \ + FUNC(cmd1_opcode, NPU_OP_BRANCH) + +#define EXPAND_CMD_CTRL(FUNC, SEP) \ + FUNC(cmd_ctrl, CMD0_CTRL) SEP \ + FUNC(cmd_ctrl, CMD1_CTRL) + +#define EXPAND_CUSTOM_DMA(FUNC, SEP) \ + FUNC(custom_dma, NOT_IMPLEMENTED) SEP \ + FUNC(custom_dma, IMPLEMENTED) + +#define EXPAND_DMA_FAULT_CHANNEL(FUNC, SEP) \ + FUNC(dma_fault_channel, CMD_READ) SEP \ + FUNC(dma_fault_channel, IFM_READ) SEP \ + FUNC(dma_fault_channel, WEIGHT_READ) SEP \ + FUNC(dma_fault_channel, SBS_READ) SEP \ + FUNC(dma_fault_channel, MEM2MEM_READ) SEP \ + FUNC(dma_fault_channel, OFM_WRITE) SEP \ + FUNC(dma_fault_channel, MEM2MEM_WRITE) + +#define EXPAND_DMA_FAULT_SRC(FUNC, SEP) \ + FUNC(dma_fault_src, SRAM) SEP \ + FUNC(dma_fault_src, EXT) + +#define EXPAND_DMA_IDX_MODE(FUNC, SEP) \ + FUNC(dma_idx_mode, DISABLED) SEP \ + FUNC(dma_idx_mode, ENABLED) + +#define EXPAND_DMA_REGION_MODE(FUNC, SEP) \ + FUNC(dma_region_mode, EXTERNAL) SEP \ + FUNC(dma_region_mode, INTERNAL) + +#define EXPAND_DMA_STRIDE_MODE(FUNC, SEP) \ + FUNC(dma_stride_mode, D1) SEP \ + FUNC(dma_stride_mode, D2) SEP \ + FUNC(dma_stride_mode, D3) + +#define EXPAND_ELEMENTWISE_MODE(FUNC, SEP) \ + FUNC(elementwise_mode, MUL) SEP \ + FUNC(elementwise_mode, ADD) SEP \ + FUNC(elementwise_mode, SUB) SEP \ + FUNC(elementwise_mode, MIN) SEP \ + FUNC(elementwise_mode, MAX) SEP \ + FUNC(elementwise_mode, LRELU) SEP \ + FUNC(elementwise_mode, ABS) SEP \ + FUNC(elementwise_mode, CLZ) SEP \ + FUNC(elementwise_mode, SHR) SEP \ + FUNC(elementwise_mode, SHL) SEP \ + FUNC(elementwise_mode, LSR) SEP \ + FUNC(elementwise_mode, DIV) SEP \ + FUNC(elementwise_mode, CMP_EQ) SEP \ + FUNC(elementwise_mode, CMP_NE) SEP \ + FUNC(elementwise_mode, CMP_GE) SEP \ + FUNC(elementwise_mode, CMP_GT) SEP \ + FUNC(elementwise_mode, AND) SEP \ + FUNC(elementwise_mode, OR) SEP \ + FUNC(elementwise_mode, XOR) SEP \ + FUNC(elementwise_mode, NOT) SEP \ + 
FUNC(elementwise_mode, AND_NOT) + +#define EXPAND_IFM_UPSCALE_MODE(FUNC, SEP) \ + FUNC(ifm_upscale_mode, NONE) SEP \ + FUNC(ifm_upscale_mode, NEAREST) SEP \ + FUNC(ifm_upscale_mode, ZEROS) + +#define EXPAND_KERNEL_DECOMPOSITION(FUNC, SEP) \ + FUNC(kernel_decomposition, D8X8) SEP \ + FUNC(kernel_decomposition, D4X4) + +#define EXPAND_KERNEL_DILATION(FUNC, SEP) \ + FUNC(kernel_dilation, NONE) SEP \ + FUNC(kernel_dilation, X2) + +#define EXPAND_MAX_BEATS(FUNC, SEP) \ + FUNC(max_beats, B64) SEP \ + FUNC(max_beats, B128) SEP \ + FUNC(max_beats, B256) + +#define EXPAND_MICROBLOCK(FUNC, SEP) \ + FUNC(microblock, U1X1) SEP \ + FUNC(microblock, U1X2) SEP \ + FUNC(microblock, U1X4) SEP \ + FUNC(microblock, U2X2) SEP \ + FUNC(microblock, U2X4) SEP \ + FUNC(microblock, U4X4) + +#define EXPAND_OFM_SCALE_MODE(FUNC, SEP) \ + FUNC(ofm_scale_mode, PER_CHANNEL) SEP \ + FUNC(ofm_scale_mode, GLOBAL) + +#define EXPAND_PMU_AXI_CHANNEL(FUNC, SEP) \ + FUNC(pmu_axi_channel, RD_CMD) SEP \ + FUNC(pmu_axi_channel, RD_IFM) SEP \ + FUNC(pmu_axi_channel, RD_WEIGHTS) SEP \ + FUNC(pmu_axi_channel, RD_SCALE_BIAS) SEP \ + FUNC(pmu_axi_channel, RD_MEM2MEM) SEP \ + FUNC(pmu_axi_channel, RD_IFM_STREAM) SEP \ + FUNC(pmu_axi_channel, RD_MEM2MEM_IDX) SEP \ + FUNC(pmu_axi_channel, WR_OFM) SEP \ + FUNC(pmu_axi_channel, WR_MEM2MEM) + +#define EXPAND_PMU_EVENT(FUNC, SEP) \ + FUNC(pmu_event, NO_EVENT) SEP \ + FUNC(pmu_event, CYCLE) SEP \ + FUNC(pmu_event, NPU_IDLE) SEP \ + FUNC(pmu_event, CC_STALLED_ON_BLOCKDEP) SEP \ + FUNC(pmu_event, CC_STALLED_ON_SHRAM_RECONFIG) SEP \ + FUNC(pmu_event, NPU_ACTIVE) SEP \ + FUNC(pmu_event, MAC_ACTIVE) SEP \ + FUNC(pmu_event, MAC_DPU_ACTIVE) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_W_OR_ACC) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_W) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_ACC) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_IB) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_INT_W) SEP \ + FUNC(pmu_event, MAC_STALLED_BY_INT_ACC) SEP \ + FUNC(pmu_event, AO_ACTIVE) SEP \ + FUNC(pmu_event, 
AO_STALLED_BY_BS_OR_OB) SEP \ + FUNC(pmu_event, AO_STALLED_BY_BS) SEP \ + FUNC(pmu_event, AO_STALLED_BY_OB) SEP \ + FUNC(pmu_event, AO_STALLED_BY_AB_OR_CB) SEP \ + FUNC(pmu_event, AO_STALLED_BY_AB) SEP \ + FUNC(pmu_event, AO_STALLED_BY_CB) SEP \ + FUNC(pmu_event, WD_ACTIVE) SEP \ + FUNC(pmu_event, WD_STALLED) SEP \ + FUNC(pmu_event, WD_STALLED_BY_WD_BUF) SEP \ + FUNC(pmu_event, WD_STALLED_BY_WS_FC) SEP \ + FUNC(pmu_event, WD_STALLED_BY_WS_TC) SEP \ + FUNC(pmu_event, WD_TRANS_WBLK) SEP \ + FUNC(pmu_event, WD_TRANS_WS_FC) SEP \ + FUNC(pmu_event, WD_TRANS_WS_TC) SEP \ + FUNC(pmu_event, WD_STALLED_BY_WS_SC0) SEP \ + FUNC(pmu_event, WD_STALLED_BY_WS_SC1) SEP \ + FUNC(pmu_event, WD_STALLED_BY_WS_SC2) SEP \ + FUNC(pmu_event, WD_STALLED_BY_WS_SC3) SEP \ + FUNC(pmu_event, WD_PARSE_ACTIVE_SC0) SEP \ + FUNC(pmu_event, WD_PARSE_ACTIVE_SC1) SEP \ + FUNC(pmu_event, WD_PARSE_ACTIVE_SC2) SEP \ + FUNC(pmu_event, WD_PARSE_ACTIVE_SC3) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_SC0) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_SC1) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_SC2) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_SC3) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_IN_SC0) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_IN_SC1) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_IN_SC2) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_IN_SC3) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_OUT_SC0) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_OUT_SC1) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_OUT_SC2) SEP \ + FUNC(pmu_event, WD_PARSE_STALL_OUT_SC3) SEP \ + FUNC(pmu_event, WD_TRANS_WS_SC0) SEP \ + FUNC(pmu_event, WD_TRANS_WS_SC1) SEP \ + FUNC(pmu_event, WD_TRANS_WS_SC2) SEP \ + FUNC(pmu_event, WD_TRANS_WS_SC3) SEP \ + FUNC(pmu_event, WD_TRANS_WB0) SEP \ + FUNC(pmu_event, WD_TRANS_WB1) SEP \ + FUNC(pmu_event, WD_TRANS_WB2) SEP \ + FUNC(pmu_event, WD_TRANS_WB3) SEP \ + FUNC(pmu_event, SRAM_RD_TRANS_ACCEPTED) SEP \ + FUNC(pmu_event, SRAM_RD_TRANS_COMPLETED) SEP \ + FUNC(pmu_event, SRAM_RD_DATA_BEAT_RECEIVED) SEP \ + FUNC(pmu_event, 
SRAM_RD_TRAN_REQ_STALLED) SEP \ + FUNC(pmu_event, SRAM_WR_TRANS_ACCEPTED) SEP \ + FUNC(pmu_event, SRAM_WR_TRANS_COMPLETED_M) SEP \ + FUNC(pmu_event, SRAM_WR_TRANS_COMPLETED_S) SEP \ + FUNC(pmu_event, SRAM_WR_DATA_BEAT_WRITTEN) SEP \ + FUNC(pmu_event, SRAM_WR_TRAN_REQ_STALLED) SEP \ + FUNC(pmu_event, SRAM_WR_DATA_BEAT_STALLED) SEP \ + FUNC(pmu_event, SRAM_ENABLED_CYCLES) SEP \ + FUNC(pmu_event, SRAM_RD_STALL_LIMIT) SEP \ + FUNC(pmu_event, SRAM_WR_STALL_LIMIT) SEP \ + FUNC(pmu_event, AXI_LATENCY_ANY) SEP \ + FUNC(pmu_event, AXI_LATENCY_32) SEP \ + FUNC(pmu_event, AXI_LATENCY_64) SEP \ + FUNC(pmu_event, AXI_LATENCY_128) SEP \ + FUNC(pmu_event, AXI_LATENCY_256) SEP \ + FUNC(pmu_event, AXI_LATENCY_512) SEP \ + FUNC(pmu_event, AXI_LATENCY_1024) SEP \ + FUNC(pmu_event, ECC_DMA) SEP \ + FUNC(pmu_event, ECC_MAC_IB) SEP \ + FUNC(pmu_event, ECC_MAC_AB) SEP \ + FUNC(pmu_event, ECC_AO_CB) SEP \ + FUNC(pmu_event, ECC_AO_OB) SEP \ + FUNC(pmu_event, ECC_AO_LUT) SEP \ + FUNC(pmu_event, EXT_RD_TRANS_ACCEPTED) SEP \ + FUNC(pmu_event, EXT_RD_TRANS_COMPLETED) SEP \ + FUNC(pmu_event, EXT_RD_DATA_BEAT_RECEIVED) SEP \ + FUNC(pmu_event, EXT_RD_TRAN_REQ_STALLED) SEP \ + FUNC(pmu_event, EXT_WR_TRANS_ACCEPTED) SEP \ + FUNC(pmu_event, EXT_WR_TRANS_COMPLETED_M) SEP \ + FUNC(pmu_event, EXT_WR_TRANS_COMPLETED_S) SEP \ + FUNC(pmu_event, EXT_WR_DATA_BEAT_WRITTEN) SEP \ + FUNC(pmu_event, EXT_WR_TRAN_REQ_STALLED) SEP \ + FUNC(pmu_event, EXT_WR_DATA_BEAT_STALLED) SEP \ + FUNC(pmu_event, EXT_ENABLED_CYCLES) SEP \ + FUNC(pmu_event, EXT_RD_STALL_LIMIT) SEP \ + FUNC(pmu_event, EXT_WR_STALL_LIMIT) + +#define EXPAND_PMU_PORT_DISABLE(FUNC, SEP) \ + FUNC(pmu_port_disable, ENABLE) SEP \ + FUNC(pmu_port_disable, DISABLE) + +#define EXPAND_POOLING_MODE(FUNC, SEP) \ + FUNC(pooling_mode, MAX) SEP \ + FUNC(pooling_mode, AVERAGE) SEP \ + FUNC(pooling_mode, REDUCE_SUM) SEP \ + FUNC(pooling_mode, SUM) SEP \ + FUNC(pooling_mode, NONE) SEP \ + FUNC(pooling_mode, MIN) SEP \ + FUNC(pooling_mode, ARGMAX_X) SEP \ + 
FUNC(pooling_mode, ARGMAX_Y) + +#define EXPAND_PRIVILEGE_LEVEL(FUNC, SEP) \ + FUNC(privilege_level, USER) SEP \ + FUNC(privilege_level, PRIVILEGED) + +#define EXPAND_RAM_ID(FUNC, SEP) \ + FUNC(ram_id, LUT) SEP \ + FUNC(ram_id, IB) SEP \ + FUNC(ram_id, AB) SEP \ + FUNC(ram_id, CB) SEP \ + FUNC(ram_id, OB) + +#define EXPAND_RESIZE_MODE(FUNC, SEP) \ + FUNC(resize_mode, BILINEAR) SEP \ + FUNC(resize_mode, REPLICATE) SEP \ + FUNC(resize_mode, NEAREST) + +#define EXPAND_ROUND_MODE_IFM(FUNC, SEP) \ + FUNC(round_mode_ifm, DOUBLE_SYMMETRIC) SEP \ + FUNC(round_mode_ifm, NATURAL) + +#define EXPAND_ROUND_MODE_OFM(FUNC, SEP) \ + FUNC(round_mode_ofm, DOUBLE_SYMMETRIC) SEP \ + FUNC(round_mode_ofm, NATURAL) SEP \ + FUNC(round_mode_ofm, DOUBLE_ASYMMETRIC) SEP \ + FUNC(round_mode_ofm, SYMMETRIC) SEP \ + FUNC(round_mode_ofm, TRUNCATE_TO_ZERO) SEP \ + FUNC(round_mode_ofm, TRUNCATE_TO_LOWER) + +#define EXPAND_SECURITY_LEVEL(FUNC, SEP) \ + FUNC(security_level, SECURE) SEP \ + FUNC(security_level, NON_SECURE) + +#define EXPAND_STATE(FUNC, SEP) \ + FUNC(state, STOPPED) SEP \ + FUNC(state, RUNNING) + +#define EXPAND_WD_ACTIVE_CORE(FUNC, SEP) \ + FUNC(wd_active_core, NONE) SEP \ + FUNC(wd_active_core, STANDARD) SEP \ + FUNC(wd_active_core, FAST) SEP \ + FUNC(wd_active_core, TENSOR) + +#define EXPAND_WEIGHT_FORMAT(FUNC, SEP) \ + FUNC(weight_format, SWD) SEP \ + FUNC(weight_format, FWD) + +#define EXPAND_WEIGHT_ORDER(FUNC, SEP) \ + FUNC(weight_order, DEPTH_FIRST) SEP \ + FUNC(weight_order, PART_KERNEL_FIRST) + +#define EXPAND_WEIGHT_SPARSITY(FUNC, SEP) \ + FUNC(weight_sparsity, NONE) SEP \ + FUNC(weight_sparsity, SPARSE_2_4) + +#ifdef __cplusplus +} +#endif diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp new file mode 100644 index 00000000..9623b239 --- /dev/null +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.cpp @@ -0,0 +1,545 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm 
Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "ethos_u85_performance.hpp" + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "architecture/architecture.hpp" +#include "ethos_u85.hpp" + +namespace regor +{ + +static const Point2i s_SubkernelLimits[] = { + {0, 0}, // No kernel + {8, 8}, // Convolution + {8, 8}, // Depthwise + {1, 1}, // VectorProduct + {8, 8}, // Pooling + {8, 8}, // ReduceSum + {1, 1}, // Elementwise + {1, 1}, // Resize +}; + +static constexpr bool OpUsesMacs(EthosU85NpuOp npuOp) +{ + return (npuOp != EthosU85NpuOp::Elementwise && npuOp != EthosU85NpuOp::Resize && npuOp != EthosU85NpuOp::Dma && npuOp != EthosU85NpuOp::None); +} + +EthosU85Performance::EthosU85Performance(ArchEthosU85 *arch, const EthosU85PerfInfo *perfInfo) : _arch(arch) +{ + _perfInfo = perfInfo; +} + +CycleCost EthosU85Performance::MeasureCycleCost(const PerformanceQuery &query, const std::vector &fused) +{ + CycleCost cycles; + auto npuOp = _arch->GetHWOp(query.type); + + // Convolution/Vector product cycle calculation + if ( OpUsesMacs(npuOp) ) + { + if ( (npuOp == EthosU85NpuOp::Depthwise) || (npuOp == EthosU85NpuOp::Pooling) ) + { + cycles.macs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements() * 1; + } + else + { + cycles.macs = int64_t(query.kernel->ElementsWH()) * query.ofmShape.Elements() * query.ifmShape[0].Depth(); + } + + cycles.opCycles = 
EstimateConvCycles(query, fused); + } + // Elementwise cycle calculation + else if ( npuOp == EthosU85NpuOp::Elementwise ) + { + auto ofmShape = + (query.ofmFormat == TensorFormat::NHCWB16) ? Shape::RoundAway(query.ofmShape, Shape(1, 1, 1, 16)) : query.ofmShape; + cycles.opCycles = int64_t(EstimateOutputCyclesPerElement(query, fused) * float(ofmShape.Elements())); + } + // Resize cycle calculation + else if ( npuOp == EthosU85NpuOp::Resize ) + { + // TODO: Implement for Resize + cycles.opCycles = 0; + } + // DMA cycle calculation + else if ( npuOp == EthosU85NpuOp::Dma ) + { + // TODO: below is incorrect (MLBEDSW-8400) + + auto ofmShape = + (query.ofmFormat == TensorFormat::NHCWB16) ? Shape::RoundAway(query.ofmShape, Shape(1, 1, 1, 16)) : query.ofmShape; + cycles.opCycles = 0; + } + else + { + assert(false && "Unknown operator cycle costing"); + } + + return cycles; +} + +int64_t EthosU85Performance::MemToMemCycles(const ArchitectureMemory *dest, const ArchitectureMemory *source, int sizeBytes) +{ + int64_t fromCycles = int64_t(float(sizeBytes) / source->Bandwidth()); + fromCycles += source->ReadLatency(); + int64_t toCycles = int64_t(float(sizeBytes) / dest->Bandwidth()); + toCycles += source->WriteLatency(); + return std::max(fromCycles, toCycles); +} + +int64_t EthosU85Performance::EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused) +{ + EthosU85OpConfig *opConfig = static_cast(query.config); + auto npuOp = _arch->GetHWOp(query.type); + assert(npuOp != EthosU85NpuOp::None); + + Shape ifmBlock = Shape::Min(query.ifmShape[0], opConfig->IfmBlock()); + Shape ofmBlock = Shape::Min(query.ofmShape, opConfig->OfmBlock()); + Shape ofmUBlock = opConfig->OfmUBlock(); + + // HW Optimisation check + if ( (ofmUBlock.Height() == 2) && (npuOp == EthosU85NpuOp::Convolution || npuOp == EthosU85NpuOp::VectorProduct) && + (query.ofmShape.Height() == 1) && (query.ofmShape.Width() % 2 == 0) && // Optimisation only applies for even + // width tensors + 
(query.kernel->Size().y == 1) ) + { + ofmUBlock = Shape(1, 1, 4, ofmUBlock.Depth()); + ofmBlock = ofmBlock.WithHeight(1); + } + + int ifmBits = DataTypeSizeBits(query.ifmType[0]); + Shape numUBlocks = Shape::DivRoundUp(ofmBlock, ofmUBlock); + bool use48BitAcc = opConfig->Acc() == EthosU85Accumulator::Acc48; + + int64_t cyclesDpuBlk = 0; + int cyclesWb = 32 * ofmUBlock.Depth() / 8; + + int subKernelWidth = s_SubkernelLimits[int(npuOp)].x; + int subKernelHeight = s_SubkernelLimits[int(npuOp)].y; + const Point2i kernelSize = query.kernel->Size(); + bool isConvolutionMxN = (npuOp == EthosU85NpuOp::Convolution); + + for ( int x = 0; x < kernelSize.x; x += subKernelWidth ) + { + for ( int y = 0; y < kernelSize.y; y += subKernelHeight ) + { + int subKernelElements = std::min(kernelSize.y - y, subKernelHeight); + subKernelElements *= std::min(kernelSize.x - x, subKernelWidth); + + // Calculate processing cycles + int numKernelSteps = 0; + int cycles = 0; + if ( npuOp == EthosU85NpuOp::Pooling ) + { + numKernelSteps = 1; + cycles = std::max(4, subKernelElements) * numUBlocks.Elements() * (ifmBits / 2); + } + else if ( npuOp == EthosU85NpuOp::Depthwise ) + { + numKernelSteps = DivRoundUp(subKernelElements, 4); + cycles = 4 * numUBlocks.ElementsWH() * (ifmBits / 8); + cycles = std::max(cyclesWb, cycles) * numKernelSteps * numUBlocks.Depth(); + } + else if ( (isConvolutionMxN && opConfig->Traversal() != EthosU85Traversal::PartKernel) || + npuOp == EthosU85NpuOp::VectorProduct || npuOp == EthosU85NpuOp::ReduceSum ) + { + numKernelSteps = subKernelElements; + cycles = std::max(cyclesWb, 4 * numUBlocks.ElementsWH()) * numKernelSteps * numUBlocks.Depth(); + } + else + { + assert(opConfig->Traversal() == EthosU85Traversal::PartKernel); + int divider = (ifmBits == 16) ? 
2 : 4; + numKernelSteps = DivRoundUp(subKernelElements, divider); + cycles = std::max(cyclesWb, 4 * numUBlocks.ElementsWH()) * numKernelSteps * numUBlocks.Depth() * + DivRoundUp(ifmBlock.Depth(), 8); + } + + // Calculate delay + int delayCycles = 0; + int delay = (use48BitAcc && (_arch->_macs <= 128)) ? 3 : 2; + + if ( numUBlocks.ElementsWH() == 1 ) + { + if ( numUBlocks.Depth() == 1 ) + { + delayCycles = delay * numKernelSteps; + } + else if ( numKernelSteps > 1 ) + { + delayCycles = delay * (numKernelSteps - 1) * numUBlocks.Depth(); + } + } + + if ( isConvolutionMxN && opConfig->Traversal() == EthosU85Traversal::PartKernel ) + { + delayCycles *= DivRoundUp(ifmBlock.Depth(), 8); + } + + cyclesDpuBlk += cycles; + cyclesDpuBlk += delayCycles; + } + } + + if ( npuOp == EthosU85NpuOp::Convolution || npuOp == EthosU85NpuOp::VectorProduct || npuOp == EthosU85NpuOp::ReduceSum ) + { + cyclesDpuBlk *= DivRoundUp(query.ifmShape[0].Depth(), ifmBlock.Depth()); + } + + cyclesDpuBlk /= _arch->_cores; + + // Estimate output cycles + int numOfmBlks = Shape::DivRoundUp(query.ofmShape, ofmBlock).Elements(); + int64_t cyclesOutputBlk = int64_t(EstimateOutputCyclesPerElement(query, fused) * float(ofmBlock.Elements())); + + // Scale and bias tensor + if ( query.constShape.Size() > 0 && query.constShape.Depth() > 0 ) + { + int cyclesBiasBlk = (10 * ofmBlock.Depth() * query.constMemory->ReadLatency() / 256); + cyclesOutputBlk = std::max(cyclesOutputBlk, int64_t(cyclesBiasBlk)); + } + + int64_t cycles_cmd = EstimateMinimumMemoryCycles(query); + cycles_cmd = (cycles_cmd + cyclesOutputBlk + cyclesDpuBlk) / 4; // Per DPU + + cyclesDpuBlk = std::max(cyclesDpuBlk, cycles_cmd); + cyclesOutputBlk = std::max(cyclesOutputBlk, cycles_cmd); + + int64_t totalCycles = 0; + if ( cyclesDpuBlk > cyclesOutputBlk ) + { + totalCycles = cyclesDpuBlk * numOfmBlks + cyclesOutputBlk; + } + else + { + totalCycles = cyclesOutputBlk * numOfmBlks + cyclesDpuBlk; + } + + return totalCycles; +} + +static int 
EstimateMemoryTransfer(int cores, bool isRead, ArchitectureMemory *memory, TensorFormat format, + int elementBits, const Shape &block, const Shape &shape, int toTransfer) +{ + int burstLen = 8; + + if ( format == TensorFormat::NHCWB16 ) + { + int zStride = (shape.Width() * elementBits * 16) / 8; + if ( zStride == block.Depth() ) + { + burstLen = elementBits * block.Depth() * block.Width(); + } + else if ( isRead ) + { + burstLen = 16 * elementBits * block.Width(); + } + else + { + burstLen = 16 * elementBits * block.Width() * cores; + } + } + else if ( format == TensorFormat::NHWC ) + { + int xStride = (shape.Depth() * elementBits) / 8; + if ( isRead ) + { + if ( xStride == block.Depth() ) + { + burstLen = elementBits * block.Depth() * block.Width(); + } + else + { + burstLen = elementBits * block.Depth(); + } + } + else + { + if ( (block.Depth() <= 16) && xStride == block.Depth() ) + { + burstLen = elementBits * block.Depth() * block.Width(); + } + else + { + burstLen = std::min(std::min(64 * 8, 16 * elementBits * cores), block.Depth() * elementBits); + } + } + } + + burstLen = std::min(memory->MaxBurstLength(), burstLen / 8); + assert(burstLen > 0 && "Burst length cannot be zero"); + return (toTransfer * memory->MaxBurstLength()) / burstLen; +} + + +int64_t EthosU85Performance::EstimateMinimumMemoryCycles(const PerformanceQuery &query) +{ + EthosU85OpConfig *opConfig = static_cast(query.config); + + int ifmBits = DataTypeSizeBits(query.ifmType[0]); // All inputs expect same bit width + const int ifmCount = query.ifmShape[1].Elements() > 0 ? 
int(std::size(query.ifmShape)) : 1; + int64_t cyclesIfm = 0; + for ( int i = 0; i < ifmCount; i++ ) + { + // Input block HW transfer (only for elements present) + int ifmBytes = Shape::Min(query.ifmShape[i], opConfig->IfmBlock()).Elements() * ifmBits / 8; + int64_t cyclesIfmBlk = query.ifmMemory[i]->ReadLatency(); + int64_t tx = EstimateMemoryTransfer(_arch->_cores, true, query.ifmMemory[i], query.ifmFormat[i], ifmBits, + opConfig->IfmBlock(), query.ifmShape[i], ifmBytes); + cyclesIfmBlk += int64_t(float(tx) / query.ifmMemory[i]->Bandwidth()); + + cyclesIfm = std::max(cyclesIfm, cyclesIfmBlk); + } + + // Output block HW transfer (only for elements present) + int ofmBits = DataTypeSizeBits(query.ofmType); + int ofmBytes = Shape::Min(query.ofmShape, opConfig->OfmBlock()).Elements() * ofmBits / 8; + int64_t cyclesOfm = query.ofmMemory->WriteLatency(); + int64_t tx = EstimateMemoryTransfer(_arch->_cores, false, query.ofmMemory, query.ofmFormat, ofmBits, + opConfig->OfmBlock(), query.ofmShape, ofmBytes); + cyclesOfm += int64_t(float(tx) / query.ofmMemory->Bandwidth()); + + return cyclesIfm + cyclesOfm; +} + + +float EthosU85Performance::EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused) +{ + EthosU85OpConfig *opConfig = static_cast(query.config); + auto npuOp = _arch->GetHWOp(query.type); + assert(npuOp != EthosU85NpuOp::None); + int ifmBits = DataTypeSizeBits(query.ifmType[0]); + int ofmBits = DataTypeSizeBits(query.ofmType); + int outputPerfIndex = 0; + + if ( (npuOp == EthosU85NpuOp::Elementwise) && (ifmBits == 32) ) + { + // Unary op else Binary op + outputPerfIndex = query.ifmShape[1].Elements() > 0 ? 
1 : 0; + } + else if ( query.type == OpType::Mul && ofmBits == 32 ) + { + outputPerfIndex = 2; + } + else if ( (query.type == OpType::Mul) || ((npuOp != EthosU85NpuOp::Elementwise) && opConfig->Acc() == EthosU85Accumulator::Acc48) ) + { + outputPerfIndex = 3; + } + else if ( query.type == OpType::Add || query.type == OpType::Sub ) + { + if ( false ) + { + // Simple Add/Sub + outputPerfIndex = 4; + } + else + { + // Advanced Add/Sub TODO: Add as perf selection as operator variant + outputPerfIndex = 5; + } + } + else if ( query.type == OpType::MaxPool ) + { + outputPerfIndex = 6; + } + else + { + outputPerfIndex = 7; + } + + int activationPerfIndex = 0; + assert(fused.size() <= 1 && "multiple op performance not available"); + for ( const FusionQuery &fusedOp : fused ) + { + if ( fusedOp.type == OpType::Sigmoid || fusedOp.type == OpType::Tanh || fusedOp.type == OpType::LookupTable ) + { + activationPerfIndex = 0; + } + else if ( fusedOp.type == OpType::Relu || fusedOp.type == OpType::Relu6 || fusedOp.type == OpType::ReluN1To1 ) + { + activationPerfIndex = 1; + } + else + { + activationPerfIndex = 2; + } + } + + float cyclesPerElement = std::max(_perfInfo->outputCycles[outputPerfIndex], _perfInfo->activationCycles[activationPerfIndex]); + + if ( npuOp == EthosU85NpuOp::Elementwise ) + { + int numElemsBlk = opConfig->OfmBlock().Elements(); + assert(numElemsBlk > 0); + float cycleCmd = (float(EstimateMinimumMemoryCycles(query)) / float(numElemsBlk) + cyclesPerElement) / 4.0f; // per DPU + cyclesPerElement = std::max(cyclesPerElement, cycleCmd); + } + + return cyclesPerElement; +} + +ElementAccess EthosU85Performance::MeasureElementAccess(const PerformanceQuery &query) +{ + ElementAccess access; + EthosU85OpConfig *opConfig = static_cast(query.config); + auto npuOp = _arch->GetHWOp(query.type); + assert(npuOp != EthosU85NpuOp::None); + + Shape ifmRounding = _arch->GetStorageRounding(query.ifmFormat[0]); + Shape ofmRounding = _arch->GetStorageRounding(query.ofmFormat); + 
+ // Convolution & pooling + if ( OpUsesMacs(npuOp) ) + { + Shape ifmBlock = Shape::Min(query.ifmShape[0], opConfig->IfmBlock()); + Shape ofmBlock = Shape::Min(query.ofmShape, opConfig->OfmBlock()); + + // Number of ofm blocks in the overall output shape + Shape ofmBlocks = Shape::DivRoundUp(query.ofmShape, ofmBlock); + + int ofmBlockDepth = ofmBlock.Depth(); + if ( npuOp == EthosU85NpuOp::Depthwise || npuOp == EthosU85NpuOp::Pooling ) + { + ofmBlocks = ofmBlocks.WithDepth(1); + ofmBlockDepth = query.ifmShape[0].Depth(); + } + + // Number of sub kernels + int subKernelWidth = s_SubkernelLimits[int(npuOp)].x; + int subKernelHeight = s_SubkernelLimits[int(npuOp)].y; + int subkernels = DivRoundUp(query.kernel->Size().x, subKernelWidth) * DivRoundUp(query.kernel->Size().y, subKernelHeight); + + int ifmFetch = + (Shape::RoundAway(ifmBlock, ifmRounding).ElementsWH() * Shape::RoundAway(query.ifmShape[0], ifmRounding).Depth()); + + int kernelRead = query.kernel->Size().AreaXY(); + if ( (npuOp != EthosU85NpuOp::Depthwise) && (npuOp != EthosU85NpuOp::Pooling) ) + { + kernelRead *= query.ifmShape[0].Depth(); + } + + int ofmBlockCount = ofmBlocks.Elements(); + + access.ifmRead[0] = ifmFetch * subkernels * ofmBlockCount; + + if ( (npuOp != EthosU85NpuOp::Pooling) && (npuOp != EthosU85NpuOp::ReduceSum) ) + { + int weightFetch = kernelRead * ofmBlockDepth * ofmBlockCount; + access.constRead[0] = weightFetch; + access.constRead[1] = query.ofmShape.Depth(); // Scales & biases + access.weightsRefetch = ofmBlocks.ElementsWH(); + } + } + else if ( npuOp == EthosU85NpuOp::Elementwise ) + { + // IFM1 is scalar + if ( query.ifmShape[0].Elements() == 1 ) + { + if ( DataTypeSizeBits(query.ifmType[0]) > 8 ) // IFM1 is a non 8-bit scalar + { + access.ifmRead[0] = Shape::RoundAway(query.ifmShape[0], ifmRounding).Elements(); + } + else if ( query.ifmShape[1].Elements() > 0 ) + { + access.ifmRead[1] = Shape::RoundAway(query.ofmShape, ifmRounding).Elements(); + } + } + else // IFM1 is not scalar 
+ { + access.ifmRead[0] = Shape::RoundAway(query.ofmShape, ifmRounding).Elements(); + if ( query.ifmShape[1].Elements() > 0 ) + { + // IFM2 is not scalar + if ( query.ifmShape[1].Elements() > 1 ) + { + access.ifmRead[1] = access.ifmRead[0]; + } + else if ( DataTypeSizeBits(query.ifmType[1]) > 8 ) // IFM2 is a non 8-bit scalar + { + access.ifmRead[1] = Shape::RoundAway(query.ifmShape[1], ifmRounding).Elements(); + } + } + } + } + else if ( npuOp == EthosU85NpuOp::Resize ) + { + // TODO: Implement for Resize + access.ifmRead[0] = Shape::RoundAway(query.ifmShape[0], ifmRounding).Elements(); + access.ofmWrite = Shape::RoundAway(query.ofmShape[0], ofmRounding).Elements(); + } + else if ( npuOp == EthosU85NpuOp::Dma ) + { + if ( query.type == OpType::Gather ) + { + // One element from IFM0 (positions) is read per element in IFM1 (index) + access.ifmRead[0] = Shape::RoundAway(query.ifmShape[1], ifmRounding).Elements(); + + // Complete IFM1 (index) is read + access.ifmRead[1] = Shape::RoundAway(query.ifmShape[1], ifmRounding).Elements(); + + // Complete OFM is written + access.ofmWrite = Shape::RoundAway(query.ofmShape[0], ofmRounding).Elements(); + } + else + { + LOG_WARN("Missing element access estimation for DMA op {}\n", OpTypeToString(query.type).c_str()); + } + } + else + { + assert(false); + } + + access.ofmWrite = Shape::RoundAway(query.ofmShape, ofmRounding).Elements(); + + return access; +} + + +ElementAccess EthosU85Performance::ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) +{ + EthosU85OpConfig *opConfig = static_cast(query.config); + auto ifmBlock = opConfig ? opConfig->IfmBlock() : Shape(1, 1, 1, 1); + auto ofmBlock = opConfig ? opConfig->OfmBlock() : Shape(1, 1, 1, 1); + + ElementAccess result = access; + + // IFM bytes transferred + const int ifmCount = query.ifmShape[1].Elements() > 0 ? 
int(std::size(query.ifmShape)) : 1; + for ( int i = 0; i < ifmCount; i++ ) + { + result.ifmRead[i] = EstimateMemoryTransfer(_arch->_cores, true, query.ifmMemory[i], query.ifmFormat[i], + DataTypeSizeBits(query.ifmType[i]), ifmBlock, query.ifmShape[i], access.ifmRead[i]); + } + + // OFM bytes transferred + result.ofmWrite = EstimateMemoryTransfer(_arch->_cores, false, query.ofmMemory, query.ofmFormat, + DataTypeSizeBits(query.ofmType), ofmBlock, query.ofmShape, access.ofmWrite); + + // These requires compression ratio information + result.constRead[0] = 0; + result.constRead[1] = 0; + + return result; +} + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp new file mode 100644 index 00000000..114e4f2b --- /dev/null +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_performance.hpp @@ -0,0 +1,60 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "common/common.hpp" + +#include "architecture/architecture.hpp" + +namespace regor +{ + +class ArchEthosU85; + +struct EthosU85PerfInfo +{ + float outputCycles[8]; + float activationCycles[3]; +}; + +/// +/// Profiles performance analysis for Ethos-U85 +/// +class EthosU85Performance : public ArchitecturePerformance +{ +protected: + ArchEthosU85 *_arch; + const EthosU85PerfInfo *_perfInfo; + +public: + EthosU85Performance(ArchEthosU85 *arch, const EthosU85PerfInfo *perfInfo); + +public: + CycleCost MeasureCycleCost(const PerformanceQuery &query, const std::vector &fused) override; + int64_t MemToMemCycles(const ArchitectureMemory *dest, const ArchitectureMemory *source, int sizeBytes) override; + ElementAccess MeasureElementAccess(const PerformanceQuery &query) override; + ElementAccess ElementTransferToBytes(const PerformanceQuery &query, const ElementAccess &access) override; + +private: + int64_t EstimateConvCycles(const PerformanceQuery &query, const std::vector &fused); + float EstimateOutputCyclesPerElement(const PerformanceQuery &query, const std::vector &fused); + int64_t EstimateMinimumMemoryCycles(const PerformanceQuery &query); +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp new file mode 100644 index 00000000..cdab052f --- /dev/null +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.cpp @@ -0,0 +1,1828 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "ethos_u85_register_cs_generator.hpp" + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "architecture/ethos_u_scaling.hpp" +#include "common/data_type.hpp" +#include "compiler/high_level_command_stream.hpp" +#include "compiler/op_type.hpp" +#include "ethos_u85.hpp" +#define NPU_DISASSEMBLE +#define NPU_NAMESPACE ethosu85 +#include "ethos_u85_interface.hpp" +#include "ethos_u85_scaling.hpp" + +#include +#include +#include + +namespace regor +{ + +using namespace ethosu85; + +void EthosU85Emitter::Emit(uint32_t instr) +{ + uint16_t cmd = instr & 0xFFFF; + assert(IsCmd0(cmd)); + bool emit = IsOp(cmd) || SetRegister(cmd, instr); + if ( emit ) + { + _stream.push_back(instr); + } +} + +void EthosU85Emitter::Emit(uint64_t instr) +{ + uint16_t cmd = instr & 0xFFFF; + assert(IsCmd1(cmd)); + bool emit = IsOp(cmd) || SetRegister(cmd, instr); + if ( emit ) + { + _stream.push_back(uint32_t(instr)); + _stream.push_back(uint32_t(instr >> 32)); + } +} + +void EthosU85Emitter::Clear() +{ + _stream.clear(); + _registers.clear(); +} + + +bool EthosU85Emitter::SetRegister(uint16_t reg, uint64_t value) +{ + auto item = _registers.find(reg); + bool isChanged = item == _registers.end() || item->second != value; + if ( isChanged ) + { + _registers[reg] = value; + } + return isChanged; +} + +bool EthosU85Emitter::IsCmd0(uint16_t key) +{ + return (key >> 14) == uint16_t(cmd_ctrl::CMD0_CTRL); +} + +bool EthosU85Emitter::IsCmd1(uint16_t key) +{ + return (key >> 14) == uint16_t(cmd_ctrl::CMD1_CTRL); +} + +bool EthosU85Emitter::IsOp(uint16_t key) 
+{ + return IsCmd0(key) ? (key & (1 << 8)) == 0 : (key & (1 << 8)) != 0; +} + + +/// +/// Generates register command streams for Ethos U85. +/// +// TODO MLBEDSW-7985 add elementwise bitwise AND_NOT +namespace +{ +const std::unordered_map kElementwiseMap = { + {OpType::Add, elementwise_mode::ADD}, + {OpType::Sub, elementwise_mode::SUB}, + {OpType::Abs, elementwise_mode::ABS}, + {OpType::Mul, elementwise_mode::MUL}, + {OpType::Minimum, elementwise_mode::MIN}, + {OpType::Maximum, elementwise_mode::MAX}, + {OpType::LeakyRelu, elementwise_mode::LRELU}, + {OpType::CLZ, elementwise_mode::CLZ}, + {OpType::SHL, elementwise_mode::SHL}, + {OpType::SHR, elementwise_mode::LSR}, + {OpType::Div, elementwise_mode::DIV}, + {OpType::LogicalAnd, elementwise_mode::AND}, + {OpType::LogicalOr, elementwise_mode::OR}, + {OpType::LogicalXor, elementwise_mode::XOR}, + {OpType::LogicalNot, elementwise_mode::NOT}, + {OpType::And, elementwise_mode::AND}, + {OpType::Or, elementwise_mode::OR}, + {OpType::Xor, elementwise_mode::XOR}, + {OpType::Not, elementwise_mode::NOT}, + {OpType::Asr, elementwise_mode::SHR}, + {OpType::Equal, elementwise_mode::CMP_EQ}, + {OpType::Greater, elementwise_mode::CMP_GT}, + {OpType::GreaterEqual, elementwise_mode::CMP_GE}, + {OpType::NotEqual, elementwise_mode::CMP_NE}, +}; + +activation_type ToActivationType(DataType type) +{ + if ( IsSignedInteger(type) || IsBool(type) ) + { + return activation_type::SIGNED; + } + else + { + assert(IsInteger(type)); + return activation_type::UNSIGNED; + } +} + +activation_format ToActivationFormat(TensorFormat format) +{ + if ( format == TensorFormat::NHCWB16 ) + { + return activation_format::NHCWB16; + } + else + { + assert(format == TensorFormat::NHWC); + return activation_format::NHWC; + } +} + +activation_precision ToActivationPrecision(DataType type) +{ + switch ( DataTypeSizeBits(type) ) + { + case 8: + return activation_precision::B8; + case 16: + return activation_precision::B16; + case 32: + return 
activation_precision::B32; + case 64: + return activation_precision::B64; + default: + assert(false); + return activation_precision::B64; + } +} + +activation_transpose ToActivationTranspose(TransposeType type) +{ + switch ( type ) + { + case TransposeType::None: + case TransposeType::NHWC: + return activation_transpose::HWC; + case TransposeType::NWHC: + return activation_transpose::WHC; + case TransposeType::NHCW: + return activation_transpose::HCW; + case TransposeType::NWCH: + return activation_transpose::WCH; + case TransposeType::NCHW: + return activation_transpose::CHW; + case TransposeType::NCWH: + return activation_transpose::CWH; + default: + assert(false && "Unknown transpose mask"); + return activation_transpose::HWC; + } +} + +activation_reverse ToActivationReverse(ReverseType type) +{ + switch ( type ) + { + case ReverseType::None: + return activation_reverse::NONE; + case ReverseType::H: + return activation_reverse::H; + case ReverseType::W: + return activation_reverse::W; + case ReverseType::C: + return activation_reverse::C; + default: + assert(false && "Unknown reverse type"); + return activation_reverse::NONE; + } +} + +ifm_upscale_mode ToIfmUpscaleMode(ArchResampling resampling) +{ + if ( resampling == ArchResampling::Nearest ) + { + return ifm_upscale_mode::NEAREST; + } + if ( resampling == ArchResampling::Zeros ) + { + return ifm_upscale_mode::ZEROS; + } + return ifm_upscale_mode::NONE; +} + +resize_mode ToResizeMode(ArchResizeMode mode) +{ + if ( mode == ArchResizeMode::Bilinear ) + { + return resize_mode::BILINEAR; + } + if ( mode == ArchResizeMode::Nearest ) + { + return resize_mode::NEAREST; + } + return resize_mode::REPLICATE; +} + +round_mode_ofm GetOfmRoundingMode(const HLCOperation *op) +{ + switch ( op->rounding ) + { + case HLCRoundMode::NATURAL: + return round_mode_ofm::NATURAL; + case HLCRoundMode::TRUNCATE: + return round_mode_ofm::TRUNCATE_TO_ZERO; + case HLCRoundMode::DBL: + return round_mode_ofm::DOUBLE_SYMMETRIC; + case 
HLCRoundMode::AUTO: + return round_mode_ofm::DOUBLE_SYMMETRIC; + case HLCRoundMode::TRUNCATE_TO_LOWER: + return round_mode_ofm::TRUNCATE_TO_LOWER; + case HLCRoundMode::DOUBLE_ASYMMETRIC: + return round_mode_ofm::DOUBLE_ASYMMETRIC; + case HLCRoundMode::SYMMETRIC: + return round_mode_ofm::SYMMETRIC; + default: + return round_mode_ofm::DOUBLE_SYMMETRIC; + } +} +} // namespace + +uint32_t EthosU85RCSGenerator::ConfigRegister(int macs, int cmdStreamVersion, int numAxiSram, int numAxiExt, int numWd, int product) +{ + return config_r{} + .set_macs_per_cc(macs) + .set_cmd_stream_version(cmdStreamVersion) + .set_num_axi_sram(numAxiSram) + .set_num_axi_ext(numAxiExt) + .set_num_wd(numWd) + .set_product(product); +} + +uint32_t EthosU85RCSGenerator::IdRegister() +{ + return id_r{}; +} + +bool EthosU85RCSGenerator::IsSupportedElementwise(const OpType opType) +{ + return kElementwiseMap.count(opType) != 0; +} + +EthosU85RCSGenerator::EthosU85RCSGenerator(ArchEthosU85 *arch) : _arch(arch) +{ +} + +void EthosU85RCSGenerator::Emit(uint32_t instr) +{ + _emit.Emit(instr); +} + +void EthosU85RCSGenerator::Emit(uint64_t instr) +{ + _emit.Emit(instr); +} + + +int EthosU85RCSGenerator::GetDoubleBufferOffset(HLCWeights *weights, int rangeIndex) +{ + int doubleBufferOffset = 0; + if ( weights->buffering == Buffering::Double ) + { + assert(weights->subStreams > 0); + int depthIndex = rangeIndex / weights->subStreams; + if ( depthIndex % 2 == 1 ) + { + doubleBufferOffset = weights->maxRangeBytes; + } + } + return doubleBufferOffset; +} + + +void EthosU85RCSGenerator::CheckAddressRange(ArchitectureMemory *memory, Address address, int size) +{ + assert(address >= 0); + if ( address >= memory->SizeBytes() ) + { + LOG_ERROR("Error: Address out of bounds, address {0}, memory '{1}' with size {2}\n", address, memory->Name(), + memory->SizeBytes()); + // TODO: replace assert by error handling + assert(false && "Address out of bounds"); + } + assert(size >= 0); + if ( address + size > 
memory->SizeBytes() ) + { + LOG_ERROR("Error: Address offset out of bounds, address {0}, offset {1}, memory '{2}' with size {3}\n", address, + size, memory->Name(), memory->SizeBytes()); + // TODO: replace assert by error handling + assert(false && "address offset out of bounds"); + } +} + +void EthosU85RCSGenerator::CheckAddresses(const HLCFeatureMap &fm) +{ + CheckAddressRange(fm.memArea.memory, fm.address, fm.AllocationSizeBytes()); + assert(fm.address % 16 == 0 || fm.format != TensorFormat::NHCWB16); +} + +// Calculates the rolling buffer address of the given coordinate. +Address EthosU85RCSGenerator::AddressForCoordinate(const HLCFeatureMap &fm, const Shape &strides, const Shape &coord) +{ + Shape truncatedCoord = Shape::PadAxes(coord, 4, 0) % Shape::PadAxes(fm.shape, 4, 1); + int offset = 0; + if ( fm.format == TensorFormat::NHWC ) + { + offset = strides.Dot(truncatedCoord); + } + else if ( fm.format == TensorFormat::NHCWB16 ) + { + constexpr int BRICK = 16; + int elemSize = DataTypeSizeBits(fm.dataType) / 8; + int strideX = BRICK * elemSize; + offset = + truncatedCoord.Height() * strides.Height() + truncatedCoord.Width() * strideX + + (truncatedCoord.Depth() / BRICK) * strides.Depth() + (truncatedCoord.Depth() % BRICK) * elemSize + + truncatedCoord.Batch() * strides.Batch(); + } + else + { + assert(false); + } + return fm.address + offset; +} + +// Calculates tile sizes/addresses of a feature map +TileBox EthosU85RCSGenerator::GetTiles(const HLCFeatureMap &fm, const Shape &strides, const Box &area) +{ + int crossingY = RoundAway(area.Start().Height() + 1, fm.shape.Height()); + crossingY = std::min(crossingY, area.End().Height()); + int crossingX = RoundAway(area.Start().Width() + 1, fm.shape.Width()); + crossingX = std::min(crossingX, area.End().Width()); + TileBox tiles; + auto height = crossingY - area.Start().Height(); + auto width = crossingX - area.Start().Width(); + tiles.height0 = (height + fm.stepXY.y - 1) / fm.stepXY.y; + tiles.height1 = 
tiles.height0; + tiles.width0 = (width + fm.stepXY.x - 1) / fm.stepXY.x; + for ( int i = 0; i < 4; ++i ) + { + tiles.address[i] = 0; + } + int fmSize = fm.AllocationSizeBytes(); + tiles.address[0] = AddressForCoordinate(fm, strides, area.Start()); + assert(fm.address <= tiles.address[0] && tiles.address[0] < fm.address + fmSize); + if ( area.End().Width() > crossingX ) + { + tiles.address[1] = AddressForCoordinate(fm, strides, area.Start().WithWidth(crossingX)); + assert(fm.address <= tiles.address[1] && tiles.address[1] < fm.address + fmSize); + assert(false && "Striping in vertical direction is not supported"); + } + if ( area.End().Height() > crossingY ) + { + tiles.address[2] = AddressForCoordinate(fm, strides, area.Start().WithHeight(crossingY)); + assert(fm.address <= tiles.address[2] && tiles.address[2] < fm.address + fmSize); + } + if ( area.End().Width() > crossingX && area.End().Height() > crossingY ) + { + tiles.address[3] = AddressForCoordinate(fm, strides, area.Start().WithWidth(crossingX).WithHeight(crossingY)); + assert(fm.address <= tiles.address[3] && tiles.address[3] < fm.address + fmSize); + } + if ( fm.format == TensorFormat::NHCWB16 ) + { + for ( int i = 0; i < 4; ++i ) + { + assert(tiles.address[i] % 16 == 0 && "NHCWB16 base address is not 16-byte aligned"); + } + } + return tiles; +} + +MemoryAccess EthosU85RCSGenerator::ToMemoryAccess(const HLCFeatureMap &fm, const Box &area, AccessDirection direction) +{ + const auto &strides = fm.strides; + Address start = AddressForCoordinate(fm, strides, area.Start()); + // Note: due to truncating of shape, AddressForCoordinate(fm, .., fm.shape) returns + // fm.address; the - Shape(1, 1, 1) prevents this + Address end = AddressForCoordinate(fm, strides, area.End() - Shape(1, 1, 1)) + DataTypeSizeBits(fm.dataType) / 8; + if ( end < start ) + { + // Area wraps around the end of the feature map + start = fm.address; + end = fm.address + fm.AllocationSizeBytes(); + } + return MemoryAccess(direction, 
fm.memArea, start, end); +} + +// Returns region number used in NPU_SET_..._REGION +uint32_t EthosU85RCSGenerator::ToRegion(const MemArea &memArea) +{ + auto region = BasePointerIndex::WeightTensor; + if ( memArea == _arch->FeatureMapMemory() ) + { + region = BasePointerIndex::ScratchTensor; + } + else if ( memArea == _arch->StagingMemory() ) + { + region = BasePointerIndex::ScratchFastTensor; + } + else if ( memArea == _arch->LUTMemory() ) + { + region = BasePointerIndex::Mem2Mem; + } + else + { + assert(memArea == _arch->ReadonlyMemory()); + } + return uint32_t(region); +} + +bool EthosU85RCSGenerator::UseZeroPoint0(OpType opType, const HLCFeatureMap &fm, bool isOFM) +{ + if ( fm.quantization.forceZeroPoint ) + { + return false; + } + if ( fm.quantization.zeroPoints.empty() || (fm.dataType == DataType::Int32 && !isOFM) ) + { + return true; + } + return opType == OpType::AvgPool || opType == OpType::Resize || opType == OpType::CLZ || opType == OpType::SHL || opType == OpType::Div; +} + + +// Checks if the feature map is a scalar, and if so, returns the +// quantized value in scalarValue. 
+bool EthosU85RCSGenerator::IsScalar(const HLCFeatureMap &fm, int32_t &scalarValue) +{ + const auto &view = fm.bufferView; + // A 1-sized feature map in constant memory is a scalar + bool isScalar = fm.shape.Elements() == 1 && view.HasBuffer(); + if ( isScalar ) + { + if ( fm.dataType == DataType::Int8 ) + { + scalarValue = view.Values()[0]; + } + else if ( fm.dataType == DataType::UInt8 ) + { + scalarValue = view.Values()[0]; + } + else if ( fm.dataType == DataType::Int16 ) + { + scalarValue = view.Values()[0]; + } + else if ( fm.dataType == DataType::Int32 ) + { + scalarValue = view.Values()[0]; + } + else + { // Unsupported scalar value + isScalar = false; + } + } + return isScalar; +} + + +// Calculates waits for KERNEL_WAIT/DMA_WAIT, returns -1 if no wait is needed +// - opAccesses contains the memory accesses for the current operation +// - outstanding contains the memory accesses for ongoing "other" operations +// (DMA operations if the current op is an NPU operation, NPU operations if the current op is a DMA operation) +// Note: NPU - NPU dependency is handled via blockdep +int EthosU85RCSGenerator::CalcCommandWaits(const MemoryAccesses &opAccesses, std::deque &outstanding) +{ + int waits = 0; + for ( int index = int(outstanding.size()) - 1; index >= 0; ++waits, --index ) + { + for ( const auto &access : opAccesses ) + { + for ( const auto &outstandingAccess : outstanding[index] ) + { + if ( access.Conflicts(outstandingAccess) ) + { + // Current op needs to wait, and after it has waited, + // outstanding[0..index] are not outstanding any longer + for ( int i = 0; i <= index; ++i ) + { + outstanding.pop_front(); + } + return waits; + } + } + } + } + return -1; +} + +// Returns LUT slot to be used for the given LUT operation. +// Sets alreadyInLutMem to true if the LUT is already in SHRAM. 
+int EthosU85RCSGenerator::AllocateLutSlot( + std::vector &lutSlots, const HLCOperation *op, int sizeInSlots, int timestamp, bool &alreadyInLutMem) +{ + alreadyInLutMem = false; + int totalSlots = int(lutSlots.size()); + if ( sizeInSlots < 0 || sizeInSlots > totalSlots ) + { + assert(false); + return 0; + } + // Returns least recently used slot, unless the LUT is already in memory + int allocatedSlot = 0; + for ( int i = 0; i < totalSlots; i += sizeInSlots ) + { + if ( lutSlots[i].hlcOp == op ) + { + // LUT is already in SHRAM + allocatedSlot = i; + alreadyInLutMem = true; + break; + } + if ( lutSlots[i].lastUsed < lutSlots[allocatedSlot].lastUsed ) + { + allocatedSlot = i; + } + } + for ( int j = allocatedSlot; j < allocatedSlot + sizeInSlots; ++j ) + { + lutSlots[j].hlcOp = op; + lutSlots[j].lastUsed = timestamp; + } + return allocatedSlot; +} + +//---------------------------------------------------------------------- +// Print +//---------------------------------------------------------------------- + +int EthosU85RCSGenerator::Disassemble(const uint32_t *in, std::string &op, std::vector> &fields) +{ + return isa::disassemble(in, op, fields); +} + +//---------------------------------------------------------------------- +// Scaling (OFM/IFM/IFM2_SCALE) +//---------------------------------------------------------------------- + +// Generates OFM_SCALE register for pooling operations +void EthosU85RCSGenerator::GenerateOFMScalingForPooling(HLCOperation *poolOp) +{ + QuantizedScale ofmScale(1, 0); + bool isNoOp = _arch->UseAvgPoolNop(poolOp->type); + ethosU85Scaling::RescalePooling(poolOp, isNoOp); + if ( !poolOp->ofm.quantization.scales.empty() ) + { + ofmScale = poolOp->ofm.quantization.scales[0]; + assert(unsigned(ofmScale.shift) < 64); + } + Emit(isa::npu_set_ofm_scale_t(uint32_t(ofmScale.shift), 0, GetOfmRoundingMode(poolOp), ofmScale.scale)); +} + +// Generates OFM/IFM/IMF2_SCALE registers for elementwise operators. 
+void EthosU85RCSGenerator::GenerateScalingForElementwise(HLCOperation *op) +{ + auto opType = op->type; + int ifmCnt = int(op->ifm.size()); + + QuantizedScale input1Scale(1, 0); + QuantizedScale input2Scale(1, 0); + QuantizedScale outScale(1, 0); + ethosU85Scaling::RescaleElementwise(op); + + auto ifmRoundMode = round_mode_ifm::DOUBLE_SYMMETRIC; + uint32_t ifmDoubleRound = 0; + auto ofmRoundMode = GetOfmRoundingMode(op); + uint32_t ofmDoubleRound = 0; + + int bitDepth = DataTypeSizeBits(op->ifm[0].dataType); + if ( opType == OpType::Mul || opType == OpType::Abs || opType == OpType::LeakyRelu ) + { + if ( !op->ofm.quantization.scales.empty() ) + { + outScale = op->ofm.quantization.scales[0]; + } + if ( opType == OpType::LeakyRelu ) + { + if ( !op->ifm[0].quantization.scales.empty() ) + { + input1Scale = op->ifm[0].quantization.scales[0]; + } + const HLCParameters *params = &op->parameters; + float alpha = params->leaky_relu.alpha; + float ifm1Scale = input1Scale.Dequantize(); + input2Scale = QuantizedScale(alpha * ifm1Scale); + ifmCnt = 2; + } + } + else if ( opType == OpType::Div ) + { + // Div operations require unit scaling + if ( !op->ifm[0].quantization.scales.empty() ) + { + auto scale = op->ifm[0].quantization.scales[0]; + assert(scale.scale == 1); + assert(scale.shift == 0); + } + if ( ifmCnt == 2 && !op->ifm[1].quantization.scales.empty() ) + { + auto scale = op->ifm[1].quantization.scales[0]; + assert(scale.scale == 1); + assert(scale.shift == 0); + } + if ( !op->ofm.quantization.scales.empty() ) + { + auto scale = op->ofm.quantization.scales[0]; + assert(scale.scale == 1); + assert(scale.shift == 0); + } + } + else if ( opType == OpType::Add || opType == OpType::Sub ) + { + // Double round is used to compensate for the left shift that happens in AdvancedElementwiseAddSubScale + ifmDoubleRound = op->ifm[0].dataType == DataType::Int8 ? 
20 : 15; + if ( !op->ofm.quantization.scales.empty() && !op->ifm[0].quantization.scales.empty() && + !op->ifm[1].quantization.scales.empty() ) + { + outScale = op->ofm.quantization.scales[0]; + input1Scale = op->ifm[0].quantization.scales[0]; + input2Scale = op->ifm[1].quantization.scales[0]; + } + } + assert(unsigned(input1Scale.shift) < 64); + Emit(isa::npu_set_ifm_scale_t(input1Scale.shift, ifmDoubleRound, ifmRoundMode, input1Scale.scale)); + if ( ifmCnt == 2 ) + { + assert(unsigned(input2Scale.shift) < 64); + Emit(isa::npu_set_ifm2_scale_t(input2Scale.shift, ifmDoubleRound, ifmRoundMode, input2Scale.scale)); + } + assert(unsigned(outScale.shift) < 64); + Emit(isa::npu_set_ofm_scale_t(outScale.shift, ofmDoubleRound, ofmRoundMode, outScale.scale)); +} + +//---------------------------------------------------------------------- +// BLOCKDEP calculation +//---------------------------------------------------------------------- + +static Shape CalcIFMJobShape(const Shape &ofmBlock, Kernel *kernel, int ifmBlockDepth) +{ + Point2i dilatedSize = kernel->DilatedWH(); + // TODO MLBEDSW-8498: Consider ifm_upscale_mode for job-shape calculations + int h = RequiredInputSize(ofmBlock.Height(), kernel->Stride().y, dilatedSize.y, 1); + int w = RequiredInputSize(ofmBlock.Width(), kernel->Stride().x, dilatedSize.x, 1); + return Shape(1, h, w, ifmBlockDepth); +} + +// Given the area and block size, adds the first/last jobs (depending on fromStart) to jobs. 
+// - area: total amount of work to perform +// - jobShape: size of each job +// - fromStart: if true, the first jobs are added, if false, the last jobs are added +// (in that case, the very last job is added last) +void EthosU85RCSGenerator::GetJobs(const Box &area, const Shape &jobShape, int nrJobsToGet, bool fromStart, std::vector &jobs) +{ + Shape jobSplit = Shape::DivRoundUp(area.End() - area.Start(), jobShape); + int z = jobSplit.Depth(); + int w = jobSplit.Width(); + int h = jobSplit.Height(); + int n = z * w * h; // n = total number of jobs for the whole area + const auto &start = area.Start().Extract(-3, -2, -1); + const auto &end = area.End().Extract(-3, -2, -1); + int firstJob = fromStart ? 0 : std::max(0, n - nrJobsToGet); + int lastJob = fromStart ? std::min(n, nrJobsToGet) : n; + for ( int i = firstJob; i < lastJob; ++i ) + { + Shape from = Shape(start.Height() + (i / (z * w)) * jobShape.Height(), + start.Width() + ((i / z) % w) * jobShape.Width(), start.Depth() + (i % z) * jobShape.Depth()); + + jobs.emplace_back(from, Shape::Min(from + jobShape, end)); + } +} + +// Calculates the value for the BLOCKDEP register +int EthosU85RCSGenerator::CalcBlockDep(HLCStripe *prevStripe, HLCStripe *stripe) +{ + if ( prevStripe == nullptr ) + { + return 0; + } + const auto &op = stripe->operation; + const auto &prevOp = prevStripe->operation; + const auto &prevOfm = prevOp->ofm; + + // TODO: Investigate if this is correct + if ( !IsNone(prevOp->ofm.transpose) ) + { + return 0; + } + + int ifmIndex = (op->ifm.size() > 1 && op->ifm[1].address == prevOfm.address && op->ifm[1].memArea == prevOfm.memArea) ? 
1 : 0; + const auto &ifm = op->ifm[ifmIndex]; + int maxJobs = _arch->MaxBlockdep(); + if ( ifm.address != prevOfm.address || ifm.memArea != prevOfm.memArea ) + { + for ( const auto &fm : op->ifm ) + { + if ( fm.memArea == prevOfm.memArea && + Overlaps(fm.address, fm.address + fm.AllocationSizeBytes(), prevOfm.address, prevOfm.address + prevOfm.AllocationSizeBytes()) ) + { + // Previous OFM overlaps in unexpected way with current IFM + assert(false && "Unexpected overlap previous OFM/current IFM"); + return 0; + } + } + // Previous operation does not produce current operation's IFM + return maxJobs; + } + if ( op->ifm.size() > 1 && ifm.AllocationSizeBytes() < op->ifm[1 - ifmIndex].AllocationSizeBytes() ) + { + // Prev OFM produces IFM2 which is broadcasted (this should be rare) + return 0; + } + if ( prevOfm.shape != ifm.shape ) + { + // OFM has been reshaped; the job overlap calculations below do not work in this case + return 0; + } + // Previous operation produces current operations IFM + auto prevConfig = static_cast(prevOp->config); + if ( !prevConfig ) + { + // Previous operation doesn't have a block config + return 0; + } + Shape prevBlock = prevConfig->OfmBlock(); + auto config = static_cast(op->config); + if ( !config ) + { + // Current operation doesn't have a block config + return 0; + } + Shape currBlock = CalcIFMJobShape(config->OfmBlock(), &op->kernel, config->IfmBlock().Depth()); + // Get the last few jobs from the previous operation (each job produces a part of the current op's IFM) + std::vector lastPrevJobs; + GetJobs(prevStripe->ofmArea, prevBlock, maxJobs, false, lastPrevJobs); + // Get the first few jobs from the current operation (each job consumes a part of the current op's IFM) + std::vector firstCurrJobs; + GetJobs(stripe->ifmAreas[ifmIndex], currBlock, maxJobs, true, firstCurrJobs); + // Find the highest blockdep such that there is no overlap between + // any job from the previous op with any job from the current op during blockdep jobs + 
int sz = int(std::min(lastPrevJobs.size(), firstCurrJobs.size())); + int prevLastIx = int(lastPrevJobs.size()) - 1; + for ( int blockdep = 0; blockdep < sz; ++blockdep ) + { + bool overlaps = false; + for ( int i = 0; !overlaps && i <= blockdep; ++i ) + { + for ( int j = blockdep - i; !overlaps && i + j <= blockdep; ++j ) + { + if ( firstCurrJobs[i].Overlaps(lastPrevJobs[prevLastIx - j]) ) + { + overlaps = true; + } + } + } + if ( overlaps ) + { + return blockdep; + } + } + // No overlap found + return sz; +} + +//---------------------------------------------------------------------- +// Register generation +//---------------------------------------------------------------------- + +void EthosU85RCSGenerator::GeneratePadding(const HLCPadding &padding) +{ + Emit(isa::npu_set_ifm_pad_top_t(padding.top)); + Emit(isa::npu_set_ifm_pad_left_t(padding.left)); + Emit(isa::npu_set_ifm_pad_bottom_t(padding.bottom)); + Emit(isa::npu_set_ifm_pad_right_t(padding.right)); +} + +// Generates ACTIVATION registers +void EthosU85RCSGenerator::GenerateActivation(const HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + const HLCOperation *op = stripe->operation.get(); + assert(op->subOps.size() <= 1); + OpType opType = OpType::None; + if ( IsActivation(op->type) ) + { + // Non-fused activation + opType = op->type; + assert(op->subOps.empty() || opType == op->subOps[0].type || opType == OpType::Sigmoid || opType == OpType::Tanh); + } + else if ( !op->subOps.empty() ) + { + // Fused activation + opType = op->subOps[0].type; + } + auto &ofm = op->ofm; + auto &ifm = op->ifm[0]; + int size = std::min(16, DataTypeSizeBits(ofm.dataType)); + assert(size > 0 && "Illegal data type"); + bool isSigned = ToActivationType(ofm.dataType) == activation_type::SIGNED; + int64_t quantizedMin = isSigned ? -(1LL << (size - 1)) : 0; + int64_t quantizedMax = isSigned ? 
(1LL << (size - 1)) - 1 : (1 << size) - 1; + auto act = activation_function::LUT_NONE; + uint32_t tableIndex = 0; + auto clipRange = DataTypeSizeBits(ofm.dataType) > 16 ? activation_clip_range::NONE : activation_clip_range::B16; + if ( ofm.quantization.quantMin.size() ) + { + quantizedMin = std::max(quantizedMin, ofm.quantization.quantMin[0]); + } + if ( ofm.quantization.quantMax.size() ) + { + quantizedMax = std::min(quantizedMax, ofm.quantization.quantMax[0]); + } + + if ( opType == OpType::LUT || opType == OpType::Sigmoid || opType == OpType::Tanh ) + { + auto ¶m = op->subOps[0].parameters.lut; + int lutSize = param.sizeBytes; + + auto pos = _stripeToLutSlot.find(stripe); + if ( pos != _stripeToLutSlot.end() ) + { + tableIndex = pos->second; + } + else + { + assert(false && "Command uses lut, but no lut info found"); + } + + // tableIndex is based on 8 slots of size 256 and alignment is the same as the LUT size + // Hardware expects 0-7 tables for 256 LUT + // 0-4 tables for 512 LUT + // 0-1 tables for 1k LUT + // 1 table for 2k LUT + // So for 512 and 1k the tableIndex is adjusted below + switch ( ofm.dataType ) + { + case DataType::Int8: + assert(lutSize == 256); + assert(param.ifmType == DataType::Int8); + act = activation_function::LUT_S8_S8; + break; + case DataType::UInt8: + assert(lutSize == 256); + assert(param.ifmType == DataType::UInt8); + act = activation_function::LUT_U8_U8; + break; + case DataType::Int16: + if ( param.ifmType == DataType::Int8 ) + { + assert(lutSize == 512 && tableIndex % 2 == 0); + act = activation_function::LUT_S8_S16; + } + else + { + assert(lutSize == 2048 && tableIndex == 0); + assert(param.ifmType == DataType::Int16); + if ( opType == OpType::LUT ) act = activation_function::LUT_S16_S16; + else if ( opType == OpType::Sigmoid ) act = activation_function::LUT_SIGMOID; + else act = activation_function::LUT_TANH; + } + break; + case DataType::Int32: + if ( param.ifmType == DataType::Int8 ) + { + assert(lutSize == 1024 && 
tableIndex % 4 == 0); + act = activation_function::LUT_S8_S32; + } + else + { + assert(lutSize == 2048 && tableIndex == 0); + assert(param.ifmType == DataType::Int16); + act = activation_function::LUT_S16_S32; + } + break; + default: + assert(false && "Unsupported LUT table"); + break; + } + + // Adjust table for 512 and 1k + tableIndex = tableIndex / (lutSize / ArchEthosU85::LUT_SLOT_SIZE); + + Address lutStart = Address(tableIndex) * lutSize; + memoryAccesses.emplace_back(AccessDirection::Read, _arch->LUTMemory(), lutStart, lutStart + lutSize); + } + assert(quantizedMin <= std::numeric_limits::max()); + assert(quantizedMax <= std::numeric_limits::max()); + Emit(isa::npu_set_activation_t(act, tableIndex, clipRange)); + Emit(isa::npu_set_activation_min_t(uint32_t(quantizedMin))); + Emit(isa::npu_set_activation_max_t(uint32_t(quantizedMax))); +} + +// Generates KERNEL related registers +void EthosU85RCSGenerator::GenerateKernel(const Kernel &kernel, bool partKernel) +{ + auto dilatedWH = kernel.DilatedWH(); + Emit(isa::npu_set_kernel_height_m1_t(dilatedWH.y - 1)); + Emit(isa::npu_set_kernel_width_m1_t(dilatedWH.x - 1)); + uint32_t stride_x_lsb = (kernel.Stride().x - 1) & 1; + uint32_t stride_y_lsb = (kernel.Stride().y - 1) & 1; + uint32_t stride_x_msb = ((kernel.Stride().x - 1) >> 1) & 1; + uint32_t stride_y_msb = ((kernel.Stride().y - 1) >> 1) & 1; + auto weightOrder = partKernel ? 
weight_order::PART_KERNEL_FIRST : weight_order::DEPTH_FIRST; + kernel_dilation dilation_x = kernel_dilation(kernel.Dilation().x - 1); + kernel_dilation dilation_y = kernel_dilation(kernel.Dilation().y - 1); + kernel_decomposition decomposition = kernel_decomposition::D8X8; // Kernel decomposition + Emit(isa::npu_set_kernel_stride_t( + stride_x_lsb, stride_y_lsb, weightOrder, dilation_x, dilation_y, decomposition, stride_x_msb, stride_y_msb)); +} + + +// Generates IFM_BROADCAST/IFM2_BROADCAST register for binary elementwise operations +static broadcast_mode CalculateBroadcast(const Shape &shape1, const Shape &shape2) +{ + uint8_t mode = uint8_t(broadcast_mode::NONE); + if ( shape1.Height() < shape2.Height() && shape1.Height() == 1 ) + { + // Broadcast in 'H' dimension + mode |= uint8_t(broadcast_mode::H); + } + if ( shape1.Width() < shape2.Width() && shape1.Width() == 1 ) + { + // Broadcast in 'W' dimension + mode |= uint8_t(broadcast_mode::W); + } + if ( shape1.Depth() < shape2.Depth() && shape1.Depth() == 1 ) + { + // Broadcast in 'C' dimension + mode |= uint8_t(broadcast_mode::C); + } + return broadcast_mode(mode); +} + +void EthosU85RCSGenerator::GenerateInputBroadcast(const Shape &ifmShape, const Shape &ifm2Shape, bool ifmIsScalar, bool ifm2IsScalar) +{ + assert(!(ifmIsScalar && ifm2IsScalar)); + // IFM broadcast + auto broadcastMode = ifmIsScalar ? broadcast_mode::SCALAR : CalculateBroadcast(ifmShape, ifm2Shape); + Emit(isa::npu_set_ifm_broadcast_t(broadcastMode)); + + // IFM2 broadcast + broadcastMode = ifm2IsScalar ? 
broadcast_mode::SCALAR : CalculateBroadcast(ifm2Shape, ifmShape); + Emit(isa::npu_set_ifm2_broadcast_t(broadcastMode)); +} + +// Generates IFM_PRECISION register +void EthosU85RCSGenerator::GenerateIFMPrecision(const HLCFeatureMap &fm) +{ + activation_type type = ToActivationType(fm.dataType); + activation_precision precision = ToActivationPrecision(fm.dataType); + activation_format format = ToActivationFormat(fm.format); + activation_storage storage = activation_storage::TILE2X2; + Emit(isa::npu_set_ifm_precision_t(type, precision, format, storage)); +} + +// Generates IFM2_PRECISION register +void EthosU85RCSGenerator::GenerateIFM2Precision(const HLCFeatureMap &fm) +{ + activation_type type = ToActivationType(fm.dataType); + activation_precision precision = ToActivationPrecision(fm.dataType); + activation_format format = ToActivationFormat(fm.format); + activation_storage storage = activation_storage::TILE2X2; + Emit(isa::npu_set_ifm2_precision_t(type, precision, format, storage)); +} + +// Generates OFM_PRECISION register +void EthosU85RCSGenerator::GenerateOFMPrecision(const HLCFeatureMap &fm, bool useGlobalScale, bool enable_output) +{ + activation_type type = ToActivationType(fm.dataType); + activation_precision precision = ToActivationPrecision(fm.dataType); + activation_format format = ToActivationFormat(fm.format); + auto scaleMode = useGlobalScale ? ofm_scale_mode::GLOBAL : ofm_scale_mode::PER_CHANNEL; + activation_reverse reverse = ToActivationReverse(fm.reverse); + activation_transpose transpose = ToActivationTranspose(fm.transpose); + // TODO implement MLBEDSW-7867 storage + activation_storage storage = enable_output ? 
activation_storage::TILE2X2 : activation_storage::NONE; + if ( reverse != activation_reverse::NONE ) + { + assert(transpose == activation_transpose::HWC && "Can't combine reverse and transpose"); + assert(storage != activation_storage::CHAINED && "Can't combine reverse and chaining"); + } + if ( transpose != activation_transpose::HWC ) + { + assert(reverse == activation_reverse::NONE && "Can't combine transpose and reverse"); + assert(storage != activation_storage::CHAINED && "Can't combine transpose and chaining"); + } + if ( storage == activation_storage::CHAINED ) + { + assert(reverse == activation_reverse::NONE && "Can't combine chaining and reverse"); + assert(transpose == activation_transpose::HWC && "Can't combine chaining and transpose"); + } + Emit(isa::npu_set_ofm_precision_t(type, precision, format, scaleMode, reverse, transpose, storage)); +} + +// Generates common IFM registers +void EthosU85RCSGenerator::GenerateIFM(OpType opType, const HLCFeatureMap &fm, const Box &inputArea, bool isScalar, int32_t scalarValue) +{ + if ( isScalar ) + { + Emit(isa::npu_set_op_scalar_t(uint32_t(scalarValue))); + } + else + { + CheckAddresses(fm); + Emit(isa::npu_set_ifm_region_t(ToRegion(fm.memArea))); + Shape strides = fm.strides; + auto tiles = GetTiles(fm, strides, inputArea); + auto boxSize = inputArea.SizeShape(); + // IFM_BASE registers + Emit(isa::npu_set_ifm_base0_t(tiles.address[0])); + Emit(isa::npu_set_ifm_base1_t(tiles.address[1])); + Emit(isa::npu_set_ifm_base2_t(tiles.address[2])); + Emit(isa::npu_set_ifm_base3_t(tiles.address[3])); + // Tile related registers + Emit(isa::npu_set_ifm_height0_m1_t(tiles.height0 - 1)); + Emit(isa::npu_set_ifm_height1_m1_t(tiles.height1 - 1)); + Emit(isa::npu_set_ifm_width0_m1_t(tiles.width0 - 1)); + Emit(isa::npu_set_ifm_depth_m1_t(boxSize.Depth() - 1)); + // IFM_STRIDE registers + Emit(isa::npu_set_ifm_stride_y_t(strides.Height() * fm.stepXY.y)); + Emit(isa::npu_set_ifm_stride_x_t(strides.Width() * fm.stepXY.x)); + 
Emit(isa::npu_set_ifm_stride_c_t(strides.Depth())); + } + // IFM_ZERO_POINT register + auto &quant = fm.quantization; + uint32_t zp = UseZeroPoint0(opType, fm, false) ? 0 : uint32_t(quant.zeroPoints[0]); + Emit(isa::npu_set_ifm_zero_point_t(zp)); +} + +// Generates common IFM2 registers +void EthosU85RCSGenerator::GenerateIFM2(OpType opType, const HLCFeatureMap &fm, const Box &inputArea, bool isScalar, int32_t scalarValue) +{ + if ( isScalar ) + { + Emit(isa::npu_set_op_scalar_t(uint32_t(scalarValue))); + } + else + { + CheckAddresses(fm); + Emit(isa::npu_set_ifm2_region_t(ToRegion(fm.memArea))); + Shape strides = fm.strides; + auto tiles = GetTiles(fm, strides, inputArea); + // IFM2_BASE registers + Emit(isa::npu_set_ifm2_base0_t(tiles.address[0])); + Emit(isa::npu_set_ifm2_base1_t(tiles.address[1])); + Emit(isa::npu_set_ifm2_base2_t(tiles.address[2])); + Emit(isa::npu_set_ifm2_base3_t(tiles.address[3])); + // Tile related registers + Emit(isa::npu_set_ifm2_height0_m1_t(tiles.height0 - 1)); + Emit(isa::npu_set_ifm2_height1_m1_t(tiles.height1 - 1)); + Emit(isa::npu_set_ifm2_width0_m1_t(tiles.width0 - 1)); + // IFM2_STRIDE registers + Emit(isa::npu_set_ifm2_stride_y_t(strides.Height() * fm.stepXY.y)); + Emit(isa::npu_set_ifm2_stride_x_t(strides.Width() * fm.stepXY.x)); + Emit(isa::npu_set_ifm2_stride_c_t(strides.Depth())); + } + // IFM2_ZERO_POINT register + auto &quant = fm.quantization; + uint32_t zp = UseZeroPoint0(opType, fm, false) ? 
0 : uint32_t(quant.zeroPoints[0]); + Emit(isa::npu_set_ifm2_zero_point_t(zp)); +} + +// Generates OFM registers +void EthosU85RCSGenerator::GenerateOFM(OpType opType, const HLCFeatureMap &fm, const Box &outputArea) +{ + CheckAddresses(fm); + Emit(isa::npu_set_ofm_region_t(ToRegion(fm.memArea))); + Shape strides = fm.strides; + auto tiles = GetTiles(fm, strides, outputArea); + auto boxSize = outputArea.SizeShape().Untranspose(fm.transpose); + // OFM_BASE registers + Emit(isa::npu_set_ofm_base0_t(tiles.address[0])); + Emit(isa::npu_set_ofm_base1_t(tiles.address[1])); + Emit(isa::npu_set_ofm_base2_t(tiles.address[2])); + Emit(isa::npu_set_ofm_base3_t(tiles.address[3])); + // OFM size (shape *before* transposition) //TODO: Maybe transpose stepXY. Here or in tiles? + Emit(isa::npu_set_ofm_height_m1_t(DivRoundUp(boxSize.Height(), fm.stepXY.y) - 1)); + Emit(isa::npu_set_ofm_width_m1_t(DivRoundUp(boxSize.Width(), fm.stepXY.x) - 1)); + Emit(isa::npu_set_ofm_depth_m1_t(boxSize.Depth() - 1)); + // Tile related registers (shape *after* transposition) + Emit(isa::npu_set_ofm_height0_m1_t(tiles.height0 - 1)); + Emit(isa::npu_set_ofm_height1_m1_t(tiles.height1 - 1)); + Emit(isa::npu_set_ofm_width0_m1_t(tiles.width0 - 1)); + // OFM_STRIDE registers + Emit(isa::npu_set_ofm_stride_y_t(strides.Height() * fm.stepXY.y)); + Emit(isa::npu_set_ofm_stride_x_t(strides.Width() * fm.stepXY.x)); + Emit(isa::npu_set_ofm_stride_c_t(strides.Depth())); + // OFM_ZERO_POINT register + auto &quant = fm.quantization; + uint32_t zp = UseZeroPoint0(opType, fm, true) ? 
0 : uint32_t(quant.zeroPoints[0]); + Emit(isa::npu_set_ofm_zero_point_t(zp)); +} + +// Generates WEIGHT registers +void EthosU85RCSGenerator::GenerateWeights(const HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto weights = stripe->operation->weights.get(); + if ( weights == nullptr ) + { + return; + } + + EthosU85OpConfig *config = static_cast(stripe->operation->config); + + auto wgtFormat = (weights->format & WeightFormat::Fast) ? weight_format::FWD : weight_format::SWD; + auto wgtSparsity = (weights->format & WeightFormat::Sparse2_4) ? weight_sparsity::SPARSE_2_4 : weight_sparsity::NONE; + Emit(isa::npu_set_weight_format_t(wgtFormat, wgtSparsity)); + + int depth = stripe->weightRangeDepth; + Emit(isa::npu_set_weight_region_t(ToRegion(weights->memArea))); + int offset = 0; + for ( int i = 0; i < _arch->_cores; ++i ) + { + Address address = 0; + int length = 0; + auto item = weights->encodedRanges.find(WeightKey(i, depth)); + if ( item != weights->encodedRanges.end() ) + { + const auto &range = item->second; + int doubleBufferOffset = GetDoubleBufferOffset(weights, range.index); + address = weights->address + offset + range.weightOffset + doubleBufferOffset; + length = RoundAway(range.weightBytes, 16); + CheckAddressRange(weights->memArea.memory, address, length); + memoryAccesses.emplace_back(AccessDirection::Read, weights->memArea, address, address + length); + offset += RoundAway(range.TotalBytes(), 16); + } + + switch ( i ) + { + case 0: + if ( length != 0 ) Emit(isa::npu_set_weight_base_t(address)); + Emit(isa::npu_set_weight_length_t(length)); + break; + case 1: + if ( length != 0 ) Emit(isa::npu_set_weight1_base_t(address)); + Emit(isa::npu_set_weight1_length_t(length)); + break; + case 2: + if ( length != 0 ) Emit(isa::npu_set_weight2_base_t(address)); + Emit(isa::npu_set_weight2_length_t(length)); + break; + case 3: + if ( length != 0 ) Emit(isa::npu_set_weight3_base_t(address)); + Emit(isa::npu_set_weight3_length_t(length)); + break; + 
default: + assert(false); + } + } +} + +// Generates SCALE registers +void EthosU85RCSGenerator::GenerateScales(const HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto scales = stripe->operation->scales.get(); + if ( scales == nullptr ) + { + assert(!stripe->operation->weights); + return; + } + int depth = stripe->weightRangeDepth; + Emit(isa::npu_set_scale_region_t(ToRegion(scales->memArea))); + auto item0 = scales->encodedRanges.find(WeightKey(0, depth)); + assert(item0 != scales->encodedRanges.end()); + auto &range0 = item0->second; + int doubleBufferOffset = GetDoubleBufferOffset(scales, range0.index); + Address address = scales->address + doubleBufferOffset; + int length = RoundAway(range0.scaleBytes, 16); + + CheckAddressRange(scales->memArea.memory, address, length); + Emit(isa::npu_set_scale_base_t(address)); + Emit(isa::npu_set_scale_length_t(length)); + memoryAccesses.emplace_back(AccessDirection::Read, scales->memArea, address, address + length); +} + +// Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers +void EthosU85RCSGenerator::GenerateBlockConfig(const EthosU85OpConfig *config, const HLCFeatureMap &fm) +{ + Shape blk = config->OfmBlock(); + // Block constraints for transpose + switch ( fm.transpose ) + { + case TransposeType::NWHC: + assert(blk.Width() <= _arch->_ofmBlockMax.Height() && "Illegal OFM block height"); + assert(blk.Height() <= _arch->_ofmBlockMax.Width() && "Illegal OFM block width"); + break; + case TransposeType::NHCW: + assert(blk.Depth() <= _arch->_ofmBlockMax.Width() && "Illegal OFM block width"); + assert(blk.Width() <= _arch->_ofmBlockMax.Depth() && "Illegal OFM block depth"); + // Width must be multiple of 16 if brick format + assert((fm.format != TensorFormat::NHCWB16 || blk.Width() % 16 == 0) && "Illegal OFM block width for brick format"); + break; + case TransposeType::NCWH: + assert(blk.Depth() <= _arch->_ofmBlockMax.Height() && "Illegal OFM block height"); + assert(blk.Height() <= _arch->_ofmBlockMax.Depth() && 
"Illegal OFM block depth"); + // Height must be multiple of 16 if brick format + assert((fm.format != TensorFormat::NHCWB16 || blk.Height() % 16 == 0) && "Illegal OFM block height for brick format"); + break; + case TransposeType::NCHW: + assert(blk.Depth() <= _arch->_ofmBlockMax.Height() && "Illegal OFM block height"); + assert(blk.Height() <= _arch->_ofmBlockMax.Width() && "Illegal OFM block width"); + assert(blk.Width() <= _arch->_ofmBlockMax.Depth() && "Illegal OFM block depth"); + // Width must be multiple of 16 if brick format + assert((fm.format != TensorFormat::NHCWB16 || blk.Width() % 16 == 0) && "Illegal OFM block width for brick format"); + break; + case TransposeType::NWCH: + assert(blk.Width() <= _arch->_ofmBlockMax.Height() && "Illegal OFM block height"); + assert(blk.Depth() <= _arch->_ofmBlockMax.Width() && "Illegal OFM block width"); + assert(blk.Height() <= _arch->_ofmBlockMax.Depth() && "Illegal OFM block depth"); + // Height must be multiple of 16 if brick format + assert((fm.format != TensorFormat::NHCWB16 || blk.Height() % 16 == 0) && "Illegal OFM block height for brick format"); + break; + default: + break; + } + // OFM block (shape *before* transposition) + Emit(isa::npu_set_ofm_blk_height_m1_t(blk.Height() - 1)); + Emit(isa::npu_set_ofm_blk_width_m1_t(blk.Width() - 1)); + Emit(isa::npu_set_ofm_blk_depth_m1_t(blk.Depth() - 1)); +} + +// Generates ACC_FORMAT register +void EthosU85RCSGenerator::GenerateAccFormat(const EthosU85OpConfig *config) +{ + auto accType = config->_accumulatorType; + acc_format format = accType == EthosU85Accumulator::Acc32 ? 
acc_format::I32 : acc_format::I48; + + auto w = config->OfmUBlock().Width(); + auto h = config->OfmUBlock().Height(); + microblock block = microblock::U1X1; + + switch ( h << 4 | w ) + { + case 0x11: + block = microblock::U1X1; + break; + case 0x12: + block = microblock::U1X2; + break; + case 0x14: + block = microblock::U1X4; + break; + case 0x22: + block = microblock::U2X2; + break; + case 0x24: + block = microblock::U2X4; + break; + case 0x44: + block = microblock::U4X4; + break; + default: + assert(false && "Invalid microblock"); + } + acc_input input; + switch ( config->AccSource() ) + { + case ArchAccumulatorSource::Acc: + input = acc_input::KEEP; + break; + case ArchAccumulatorSource::Ifm2: + input = acc_input::IFM2; + break; + case ArchAccumulatorSource::Reset: + default: + input = acc_input::RESET; + } + acc_output output = config->AccOutputEnabled() ? acc_output::ENABLE : acc_output::DISABLE; + + Emit(isa::npu_set_acc_format_t(format, input, output, block)); +} + +// Calculates and generates KERNEL_WAIT or DMA_WAIT register +void EthosU85RCSGenerator::GenerateWaits(bool isKernelWait, const MemoryAccesses &memoryAccesses, int maxWaits, + std::deque &outstandingAccesses, std::deque &accessesToUpdate) +{ + int waits = CalcCommandWaits(memoryAccesses, outstandingAccesses); + if ( waits >= 0 ) + { + if ( isKernelWait ) + { + Emit(isa::npu_op_kernel_wait_t(waits)); + } + else + { + Emit(isa::npu_op_dma_wait_t(waits)); + } + } + accessesToUpdate.push_back(memoryAccesses); + if ( int(accessesToUpdate.size()) > maxWaits ) + { + accessesToUpdate.pop_front(); + } +} + +// Inserts DMA commands for copying LUTs from constant memory +// to LUT memory +std::vector> +EthosU85RCSGenerator::InsertLUTDMACommands(std::vector> &cmds) +{ + std::vector> result; + int lutSlotSize = ArchEthosU85::LUT_SLOT_SIZE; + int slots = int(_arch->_lutRam->SizeBytes() / lutSlotSize); + std::vector lutSlots(slots); + int timestamp = 0; + result.reserve(cmds.size()); + for ( auto &hlc : cmds ) 
+ { + ++timestamp; + if ( hlc->IsStripe() ) + { + auto stripe = static_cast(hlc.get()); + auto op = stripe->operation; + auto config = static_cast(op->config); + if ( !op->subOps.empty() && op->subOps[0].type == OpType::LUT ) + { + const auto &srcTens = op->subOps[0].parameters.lut; + assert(srcTens.sizeBytes % lutSlotSize == 0); + bool alreadyInLutMem; + int sizeInSlots = srcTens.sizeBytes / lutSlotSize; + int slot = AllocateLutSlot(lutSlots, op.get(), sizeInSlots, timestamp, alreadyInLutMem); + _stripeToLutSlot[stripe] = slot; + + if ( !alreadyInLutMem ) + { + auto dma = std::make_unique(); + dma->srcMemArea = srcTens.memArea; + dma->srcAddress = srcTens.address; + dma->length = srcTens.sizeBytes; + dma->destMemArea = _arch->LUTMemory(); + dma->destAddress = slot * lutSlotSize; + result.push_back(std::move(dma)); + } + } + } + result.push_back(std::move(hlc)); + } + return result; +} + +//---------------------------------------------------------------------- +// Operations +//---------------------------------------------------------------------- + +// Generates NPU_OP_* command +void EthosU85RCSGenerator::GenerateOperationCode(const HLCOperation *op) +{ + auto opType = op->type; + if ( opType == OpType::Resize ) + { + resize_mode mode = ToResizeMode(op->parameters.resize.mode); + Emit(isa::npu_op_resize_t(mode)); + } + else if ( IsPooling(opType) ) + { + pooling_mode mode; + if ( opType == OpType::AvgPool ) + { + auto kernelSize = op->kernel.Size(); + // SUM when kernel size > 8x8 + mode = (kernelSize.x <= 8 && kernelSize.y <= 8) ? pooling_mode::AVERAGE : pooling_mode::SUM; + } + else if ( opType == OpType::MaxPool ) + { + mode = pooling_mode::MAX; + } + else if ( opType == OpType::ArgMax ) + { + auto axis = op->parameters.argmax.axis; + assert(axis == 1 || axis == 2); + mode = axis == 1 ? 
pooling_mode::ARGMAX_Y : pooling_mode::ARGMAX_X; + } + else + { + assert(opType == OpType::ReduceSum); + mode = pooling_mode::REDUCE_SUM; + } + Emit(isa::npu_op_pool_t(mode)); + } + else if ( IsDepthwise(opType) ) + { + Emit(isa::npu_op_depthwise_t()); + } + else if ( IsConvolution(opType) || IsVectorProduct(opType) ) + { + // Dynamic weights when op->ifm.size() == 2, _weights_ifm2 parameter should be True + Emit(isa::npu_op_conv_t(op->ifm.size() == 2)); + } + else if ( IsElementwise(opType) ) + { + const auto &item = kElementwiseMap.find(opType); + if ( item == kElementwiseMap.end() ) + { + assert(false && "Unsupported elementwise operator"); + } + else + { + Emit(isa::npu_op_elementwise_t(item->second)); + } + } + else if ( IsDma(opType) ) + { + Emit(isa::npu_op_dma_start_t()); + } + else if ( _arch->UseAvgPoolNop(opType) ) + { + // Implemented using SUM + Emit(isa::npu_op_pool_t(pooling_mode::SUM)); + } + else + { + assert(false && "Unsupported operator"); + } +} + +void EthosU85RCSGenerator::GenerateCommon(const HLCStripe *stripe, bool useGlobalScale, MemoryAccesses &memoryAccesses) +{ + auto op = stripe->operation.get(); + int32_t scalarValue = 0; + bool isScalar = IsScalar(op->ifm[0], scalarValue) && IsElementwise(op->type); + GenerateIFMPrecision(op->ifm[0]); + GenerateIFM(op->type, op->ifm[0], stripe->ifmAreas[0], isScalar, scalarValue); + if ( !isScalar ) + { + memoryAccesses.push_back(ToMemoryAccess(op->ifm[0], stripe->ifmAreas[0], AccessDirection::Read)); + } + ifm_upscale_mode upscaleMode = ToIfmUpscaleMode(op->ifm[0].resamplingMode); + Emit(isa::npu_set_ifm_upscale_t(upscaleMode)); + if ( !IsElementwise(op->type) ) + { + GeneratePadding(stripe->padding); + } + GenerateOFM(op->type, op->ofm, stripe->ofmArea); + memoryAccesses.push_back(ToMemoryAccess(op->ofm, stripe->ofmArea, AccessDirection::Write)); + EthosU85OpConfig *config = static_cast(stripe->operation->config); + GenerateOFMPrecision(op->ofm, useGlobalScale, config->AccOutputEnabled()); + if ( 
!IsElementwise(op->type) && op->type != OpType::Resize ) + { + GenerateKernel(op->kernel, config->Traversal() == EthosU85Traversal::PartKernel); + } + GenerateWeights(stripe, memoryAccesses); + GenerateScales(stripe, memoryAccesses); + GenerateActivation(stripe, memoryAccesses); +} + +// Conv2D/Depthwise operations +void EthosU85RCSGenerator::GenerateConvolutionOp(const HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto op = stripe->operation.get(); + QuantizedScale ofmScale(1, 0); + bool useGlobalScale = false; + ethosU85Scaling::RescaleConvolution(op); + + if ( op->ifm.size() == 2 ) + { + // Dynamic weights + useGlobalScale = true; + GenerateIFM2(op->type, op->ifm[1], stripe->ifmAreas[1], false, 0); + GenerateIFM2Precision(op->ifm[1]); + Emit(isa::npu_set_weight_format_t(weight_format::SWD, weight_sparsity::NONE)); // Reset weight format + } + + if ( !op->ofm.quantization.scales.empty() ) + { + ofmScale = op->ofm.quantization.scales[0]; + assert(unsigned(ofmScale.shift) < 64); + } + Emit(isa::npu_set_ofm_scale_t(ofmScale.shift, 0, GetOfmRoundingMode(op), ofmScale.scale)); + GenerateCommon(stripe, useGlobalScale, memoryAccesses); +} + +// MaxPool/AvgPool or operations that are mapped to AvgPool +void EthosU85RCSGenerator::GeneratePoolingOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto op = stripe->operation.get(); + bool useGlobalScale = true; // TODO: Add any per channel scaling modes + if ( _arch->UseAvgPoolNop(op->type) ) + { + assert(op->kernel.Size() == Point2i(1, 1)); + assert(op->kernel.Stride() == Point2i(1, 1)); + assert(op->kernel.Dilation() == Point2i(1, 1)); + assert(op->kernel.DepthMultiplier() == 1); + assert(useGlobalScale); + } + GenerateCommon(stripe, useGlobalScale, memoryAccesses); + if ( useGlobalScale ) + { + GenerateOFMScalingForPooling(op); + } +} + +// Elementwise operations +void EthosU85RCSGenerator::GenerateElementwiseOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto op = stripe->operation.get(); + 
auto opType = op->type; + constexpr bool useGlobalScale = true; + if ( IsUnaryElementwise(opType) ) + { + assert(op->ifm.size() == 1); + GenerateScalingForElementwise(op); + GenerateCommon(stripe, useGlobalScale, memoryAccesses); + } + else + { + // Binary operation: generate IFM2 registers + assert(op->ifm.size() == 2); + assert(stripe->ifmAreas.size() == 2); + int32_t scalarValue = 0; + auto ifmShape = stripe->ifmAreas[0].SizeShape(); + auto ifm2Shape = stripe->ifmAreas[1].SizeShape(); + GenerateScalingForElementwise(op); + GenerateCommon(stripe, useGlobalScale, memoryAccesses); + bool ifmIsScalar = IsScalar(op->ifm[0], scalarValue); + bool ifm2IsScalar = IsScalar(op->ifm[1], scalarValue); + GenerateIFM2(opType, op->ifm[1], stripe->ifmAreas[1], ifm2IsScalar, scalarValue); + if ( !ifm2IsScalar ) + { + memoryAccesses.push_back(ToMemoryAccess(op->ifm[1], stripe->ifmAreas[1], AccessDirection::Read)); + } + GenerateIFM2Precision(op->ifm[1]); + GenerateInputBroadcast(ifmShape, ifm2Shape, ifmIsScalar, ifm2IsScalar); + } +} + +// Resize operations +void EthosU85RCSGenerator::GenerateResizeOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto op = stripe->operation.get(); + auto opType = op->type; + constexpr bool useGlobalScale = true; + auto *config = static_cast<EthosU85OpConfig *>(op->config); + Shape ofmBlock = config->_ofmBlock; + + auto ifmShape = stripe->ifmAreas[0].SizeShape(); + + // operator-parameters + const HLCParameters *params = &op->parameters; + const auto &scale_w = params->resize.scaleX; + const auto &scale_h = params->resize.scaleY; + int offset_h = params->resize.offsetY; + int offset_w = params->resize.offsetX; + + round_mode_ofm roundMode = GetOfmRoundingMode(op); + + // scaling is shift only and + 16 + QuantizedScale ofmScale = op->ofm.quantization.scales[0]; + int shift = 16 + ofmScale.shift; + + // X - width + int one_step_int_w = scale_w.d / scale_w.n; + int one_step_mod_w = scale_w.d % scale_w.n; + int blk_step_int_w = ((ofmBlock.Width() - 1) * 
scale_w.d) / scale_w.n; + int blk_step_mod_w = ((ofmBlock.Width() - 1) * scale_w.d) % scale_w.n; + + // Y - height + int one_step_int_h = scale_h.d / scale_h.n; + int one_step_mod_h = scale_h.d % scale_h.n; + int blk_step_int_h = ((ofmBlock.Height() - 1) * scale_h.d) / scale_h.n; + int blk_step_mod_h = ((ofmBlock.Height() - 1) * scale_h.d) % scale_h.n; + + // asserts + assert(shift < (1 << 6)); + assert(ofmScale.scale == 1); + assert(scale_w.n <= 2048); + assert(scale_h.n <= 2048); + assert(-scale_h.n <= offset_h); + assert(offset_h < scale_h.n); + assert(one_step_mod_h < scale_h.n); + assert(-scale_w.n <= offset_w); + assert(offset_w < scale_w.n); + assert(one_step_mod_w < scale_w.n); + assert(ToIfmUpscaleMode(op->ifm[0].resamplingMode) == ifm_upscale_mode::NONE); + assert(ofmBlock.Height() == 1); + assert(ToActivationTranspose(op->ofm.transpose) == activation_transpose::HWC); + + GenerateCommon(stripe, useGlobalScale, memoryAccesses); + + // Resize requires ifm2_zero_point 0 + Emit(isa::npu_set_ifm2_zero_point_t(0)); + Emit(isa::npu_set_ofm_scale_t(16 + ofmScale.shift, 0, roundMode, 1)); + + // Resize specific registers + Emit(isa::npu_set_resize_x_scale_n_m1_t(scale_w.n - 1)); + Emit(isa::npu_set_resize_y_scale_n_m1_t(scale_h.n - 1)); + Emit(isa::npu_set_resize_x_step_t(one_step_int_w, blk_step_int_w, one_step_mod_w, blk_step_mod_w)); + Emit(isa::npu_set_resize_y_step_t(one_step_int_h, blk_step_int_h, one_step_mod_h, blk_step_mod_h)); + Emit(isa::npu_set_resize_x_offset_t(offset_w)); + Emit(isa::npu_set_resize_y_offset_t(offset_h)); + Emit(isa::npu_set_kernel_height_m1_t(ifmShape.Height() - 1)); + Emit(isa::npu_set_kernel_width_m1_t(ifmShape.Width() - 1)); +} + +bool EthosU85RCSGenerator::GenerateStripe(HLCStripe *stripe, MemoryAccesses &memoryAccesses) +{ + auto opType = stripe->operation->type; + EthosU85NpuOp npuOp = ArchEthosU85::GetHWOp(opType); + + if ( npuOp == EthosU85NpuOp::Pooling || npuOp == EthosU85NpuOp::ReduceSum ) + { + GeneratePoolingOp(stripe, 
memoryAccesses); + } + else if ( npuOp == EthosU85NpuOp::Depthwise || npuOp == EthosU85NpuOp::Convolution || npuOp == EthosU85NpuOp::VectorProduct ) + { + GenerateConvolutionOp(stripe, memoryAccesses); + } + else if ( npuOp == EthosU85NpuOp::Elementwise ) + { + GenerateElementwiseOp(stripe, memoryAccesses); + } + else if ( npuOp == EthosU85NpuOp::Resize ) + { + GenerateResizeOp(stripe, memoryAccesses); + } + else + { + LOG_ERROR("Register command stream generator: unsupported operator '{}'\n", OpTypeToString(opType)); + assert(false); + return false; + } + EthosU85OpConfig *config = static_cast<EthosU85OpConfig *>(stripe->operation->config); + GenerateBlockConfig(config, stripe->operation->ofm); + GenerateAccFormat(config); + return true; +} + +// Generates register commands for DMA operations +void EthosU85RCSGenerator::GenerateDMA(const HLCDMA *dma, MemoryAccesses &memoryAccesses) +{ + dma_region_mode srcRegionMode = dma->srcMemArea == _arch->LUTMemory() ? dma_region_mode::INTERNAL : dma_region_mode::EXTERNAL; + dma_region_mode destRegionMode = dma->destMemArea == _arch->LUTMemory() ? dma_region_mode::INTERNAL : dma_region_mode::EXTERNAL; + + uint32_t size0 = dma->sizes.Size() > 0 ? dma->sizes[-1] : 1; + uint32_t size1 = dma->sizes.Size() > 1 ? dma->sizes[-2] : 1; + uint64_t srcStride0 = dma->srcStrides.Size() > 1 ? dma->srcStrides[-2] : 0; + uint64_t srcStride1 = dma->srcStrides.Size() > 2 ? dma->srcStrides[-3] : 0; + uint64_t destStride0 = dma->destStrides.Size() > 1 ? dma->destStrides[-2] : 0; + uint64_t destStride1 = dma->destStrides.Size() > 2 ? dma->destStrides[-3] : 0; + + dma_stride_mode srcStrideMode; + if ( size1 > 1 ) srcStrideMode = dma_stride_mode::D3; + else if ( size0 > 1 ) srcStrideMode = dma_stride_mode::D2; + else srcStrideMode = dma_stride_mode::D1; + + dma_idx_mode srcIndexMode = dma->srcIndexed ? dma_idx_mode::ENABLED : dma_idx_mode::DISABLED; + dma_idx_mode destIndexMode = dma->destIndexed ? 
dma_idx_mode::ENABLED : dma_idx_mode::DISABLED; + assert(!(srcIndexMode == dma_idx_mode::ENABLED && destIndexMode == dma_idx_mode::ENABLED)); + + // Registers for 1D, 2D and 3D mode + Emit(isa::npu_set_dma0_src_region_t(ToRegion(dma->srcMemArea), srcRegionMode, srcStrideMode, srcIndexMode)); + Emit(isa::npu_set_dma0_src_t(dma->srcAddress)); + Emit(isa::npu_set_dma0_dst_region_t(ToRegion(dma->destMemArea), destRegionMode, destIndexMode)); + Emit(isa::npu_set_dma0_dst_t(dma->destAddress)); + assert(dma->length > 0); + Emit(isa::npu_set_dma0_len_t(dma->length)); + + if ( srcStrideMode != dma_stride_mode::D1 ) + { + // Registers for 2D and 3D mode + assert(size0 > 0); + Emit(isa::npu_set_dma0_size0_t(size0)); + } + + if ( srcStrideMode != dma_stride_mode::D1 || dma->srcIndexed ) + { + // Registers for 2D and 3D mode, or src indexed operation + Emit(isa::npu_set_dma0_src_stride0_t(srcStride0)); + } + + if ( srcStrideMode != dma_stride_mode::D1 || dma->destIndexed ) + { + // Registers for 2D and 3D mode, or dest indexed operation + Emit(isa::npu_set_dma0_dst_stride0_t(destStride0)); + } + + if ( srcStrideMode == dma_stride_mode::D3 ) + { + // Registers for 3D mode + assert(size1 > 0); + Emit(isa::npu_set_dma0_size1_t(size1)); + Emit(isa::npu_set_dma0_src_stride1_t(srcStride1)); + Emit(isa::npu_set_dma0_dst_stride1_t(destStride1)); + } + + if ( dma->srcIndexed || dma->destIndexed ) + { + // Registers for indexed operation + Emit(isa::npu_set_dma0_idx_region_t(ToRegion(dma->idxMemArea))); + assert(dma->idxMax > 0); + Emit(isa::npu_set_dma0_idx_max_t(dma->idxMax)); + Emit(isa::npu_set_dma0_idx_t(dma->idxAddress)); + } + + if ( srcStrideMode == dma_stride_mode::D3 && (dma->srcIndexed || dma->destIndexed) ) + { + Emit(isa::npu_set_dma0_idx_skip1_t(dma->idxSkip1)); + } + + if ( srcStrideMode == dma_stride_mode::D1 ) + { + // Address accesses for 1D mode + CheckAddressRange(dma->srcMemArea.memory, dma->srcAddress, dma->length); + CheckAddressRange(dma->destMemArea.memory, 
dma->destAddress, dma->length); + memoryAccesses.emplace_back(AccessDirection::Read, dma->srcMemArea, dma->srcAddress, dma->srcAddress + dma->length); + memoryAccesses.emplace_back(AccessDirection::Write, dma->destMemArea, dma->destAddress, dma->destAddress + dma->length); + } + else + { + // Address accesses for 2D and 3D mode + CheckAddressRange(dma->srcMemArea.memory, dma->srcAddress, dma->srcStrides[0]); + CheckAddressRange(dma->destMemArea.memory, dma->destAddress, dma->destStrides[0]); + memoryAccesses.emplace_back(AccessDirection::Read, dma->srcMemArea, dma->srcAddress, dma->srcAddress + dma->srcStrides[0]); + memoryAccesses.emplace_back(AccessDirection::Write, dma->destMemArea, dma->destAddress, dma->destAddress + dma->destStrides[0]); + } + + if ( dma->srcIndexed || dma->destIndexed ) + { + // Address accesses for indexed operation + CheckAddressRange(dma->idxMemArea.memory, dma->idxAddress, size0 * size1); + memoryAccesses.emplace_back(AccessDirection::Read, dma->idxMemArea, dma->idxAddress, dma->idxAddress + size0 * size1); + } +} + +std::vector EthosU85RCSGenerator::GenerateCommandStream(std::vector> &highLevelCommandStream, + std::vector> *cmdRanges, bool verbose) +{ + _emit.Clear(); + _stripeToLutSlot.clear(); + GenerateInitialRegisterSetup(); + auto cmds = InsertLUTDMACommands(highLevelCommandStream); + std::deque outstandingDmaAccesses; + std::deque outstandingNpuAccesses; + int maxOutstandingDMAOps = _arch->MaxOutstandingDMAOps(); + int maxOutstandingKernelOps = _arch->MaxOutstandingKernelOps(); + HLCStripe *prevOp = nullptr; + std::vector> debugInfo; + for ( auto &hlc : cmds ) + { + MemoryAccesses memoryAccesses; + int emitStart = _emit.Position(); + if ( hlc->IsStripe() ) + { + auto stripe = static_cast(hlc.get()); + if ( verbose ) + { + debugInfo.emplace_back(emitStart, stripe->operation->ToString()); + } + if ( !GenerateStripe(stripe, memoryAccesses) ) + { + return std::vector(); + } + // BLOCKDEP register + int blockdep = CalcBlockDep(prevOp, 
stripe); + Emit(isa::npu_set_blockdep_t(blockdep)); + GenerateWaits(false, memoryAccesses, maxOutstandingKernelOps, outstandingDmaAccesses, outstandingNpuAccesses); + GenerateOperationCode(stripe->operation.get()); + prevOp = stripe; + // Return command mapping information to the caller + int emitEnd = _emit.Position(); + if ( cmdRanges ) + { + cmdRanges->emplace_back(stripe->operation->_srcKey, emitStart, emitEnd); + } + } + else + { + auto dma = static_cast(hlc.get()); + if ( verbose ) + { + debugInfo.emplace_back(emitStart, dma->ToString()); + } + GenerateDMA(dma, memoryAccesses); + GenerateWaits(true, memoryAccesses, maxOutstandingDMAOps, outstandingNpuAccesses, outstandingDmaAccesses); + Emit(isa::npu_op_dma_start_t()); + } + } + Emit(isa::npu_op_stop_t(0xFFFF)); + if ( verbose ) + { + PrintCommandStream(_emit.CommandStream(), debugInfo); + } + return _emit.CommandStream(); +} + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.hpp new file mode 100644 index 00000000..6c057550 --- /dev/null +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_register_cs_generator.hpp @@ -0,0 +1,250 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "architecture/architecture.hpp" +#include "architecture/ethos_u_register_cs_generator.hpp" +#include "ethos_u85.hpp" + +#include +#include +#include +#include + +namespace regor +{ + +class EthosU85Emitter +{ +public: + EthosU85Emitter() = default; + void Emit(uint32_t instr); + void Emit(uint64_t instr); + void Clear(); + int Position() const { return int(_stream.size()); } + const std::vector &CommandStream() const { return _stream; } + +private: + bool SetRegister(uint16_t reg, uint64_t value); + static bool IsCmd0(uint16_t key); + static bool IsCmd1(uint16_t key); + static bool IsOp(uint16_t key); + std::vector _stream; + std::unordered_map _registers; +}; + + +// Specifies the addresses and dimensions of the tiles of a feature map. +// A feature map can use 1 to 4 tiles +struct TileBox +{ + int height0; // The height of tile 0 + int height1; // The height of tile 1 + int width0; // The width of tile 0, and tile 2 (if used) + Address address[4]; // Tile addresses +}; + +enum BasePointerIndex +{ + WeightTensor = 0, // base address index for the Weight tensor + ScratchTensor = 1, // base address index for the Scratch_tensor in the TensorArena + ScratchFastTensor = 2, // base address for the Scratch_fast_tensor + Mem2Mem = 3, // base address slot for memory to memory transfer +}; + +enum class AccessDirection +{ + Read = 0, + Write = 1, +}; + +struct MemoryAccess +{ + AccessDirection direction; + MemArea memArea; + Address start; + Address end; + + MemoryAccess(AccessDirection direction_, MemArea area_, Address start_, Address end_) : + direction(direction_), memArea(area_), start(start_), end(end_) + { + } + + bool Conflicts(const MemoryAccess &other) const + { + bool overlaps = Overlaps(start, end, other.start, other.end) && memArea == other.memArea; + return overlaps && (direction != AccessDirection::Read || other.direction != AccessDirection::Read); + } +}; + +using MemoryAccesses = std::vector; + +struct LutSlot +{ + const 
HLCOperation *hlcOp = nullptr; + int lastUsed = 0; +}; + +/// +/// Generates register command streams for Ethos U85 +/// +class EthosU85RCSGenerator : public EthosURegisterCSGenerator +{ + +public: + EthosU85RCSGenerator(ArchEthosU85 *arch); + + //---------------------------------------------------------------------- + // Print + //---------------------------------------------------------------------- + + int Disassemble(const uint32_t *in, std::string &op, std::vector> &fields); + +protected: + //---------------------------------------------------------------------- + // Helper functions + //---------------------------------------------------------------------- + + void Emit(uint32_t instr); + void Emit(uint64_t instr); + + static int GetDoubleBufferOffset(HLCWeights *weights, int rangeIndex); + static void CheckAddressRange(ArchitectureMemory *memory, Address address, int size); + static void CheckAddresses(const HLCFeatureMap &fm); + // Calculates the rolling buffer address of the given coordinate. + static Address AddressForCoordinate(const HLCFeatureMap &fm, const Shape &strides, const Shape &coord); + // Calculates tile sizes/addresses of a feature map + static TileBox GetTiles(const HLCFeatureMap &fm, const Shape &strides, const Box &area); + MemoryAccess ToMemoryAccess(const HLCFeatureMap &fm, const Box &area, AccessDirection direction); + // Returns region number used in NPU_SET_..._REGION + uint32_t ToRegion(const MemArea &memArea); + static bool UseZeroPoint0(OpType opType, const HLCFeatureMap &fm, bool isOFM); + // Checks if the feature map is a scalar, and if so, returns the + // quantized value in scalarValue. 
+ static bool IsScalar(const HLCFeatureMap &fm, int32_t &scalarValue); + // Calculates waits for KERNEL_WAIT/DMA_WAIT, returns -1 if no wait is needed + // - opAccesses contains the memory accesses for the current operation + // - outstanding contains the memory accesses for ongoing "other" operations + // (DMA operations if the current op is an NPU operation, NPU operations if the current op is a DMA operation) + // Note: NPU - NPU dependency is handled via blockdep + static int CalcCommandWaits(const MemoryAccesses &opAccesses, std::deque &outstanding); + // Returns LUT slot to be used for the given LUT operation. + // Sets alreadyInLutMem to true if the LUT is already in SHRAM. + int AllocateLutSlot(std::vector &lutSlots, const HLCOperation *op, int sizeInSlots, int timestamp, bool &alreadyInLutMem); + //---------------------------------------------------------------------- + // Scaling (OFM/IFM/IFM2_SCALE) + //---------------------------------------------------------------------- + + // Generates OFM_SCALE register for pooling operations + void GenerateOFMScalingForPooling(HLCOperation *poolOp); + // Generates OFM/IFM/IFM2_SCALE registers for elementwise operators. + void GenerateScalingForElementwise(HLCOperation *op); + + + + //---------------------------------------------------------------------- + // BLOCKDEP calculation + //---------------------------------------------------------------------- + + // Given the area and block size, adds the first/last jobs (depending on fromStart) to jobs. 
+ // - area: total amount of work to perform + // - block: size of each job + // - fromStart: if true, the first jobs are added, if false, the last jobs are added + // (in that case, the very last job is added last) + void GetJobs(const Box &area, const Shape &block, int nrJobsToGet, bool fromStart, std::vector &jobs); + // Calculates the value for the BLOCKDEP register + int CalcBlockDep(HLCStripe *prevStripe, HLCStripe *stripe); + + //---------------------------------------------------------------------- + // Register generation + //---------------------------------------------------------------------- + + void GeneratePadding(const HLCPadding &padding); + // Generates ACTIVATION registers + void GenerateActivation(const HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Generates KERNEL related registers + void GenerateKernel(const Kernel &kernel, bool partKernel); + // Generates IFM_BROADCAST and IFM2_BROADCAST register for binary elementwise operations + void GenerateInputBroadcast(const Shape &ifmShape, const Shape &ifm2Shape, bool ifmIsScalar, bool ifm2IsScalar); + // Generates IFM_PRECISION register + void GenerateIFMPrecision(const HLCFeatureMap &fm); + // Generates IFM2_PRECISION register + void GenerateIFM2Precision(const HLCFeatureMap &fm); + // Generates OFM_PRECISION register + void GenerateOFMPrecision(const HLCFeatureMap &fm, bool useGlobalScale, bool enable_output); + // Generates common IFM registers + void GenerateIFM(OpType opType, const HLCFeatureMap &fm, const Box &inputArea, bool isScalar, int32_t scalarValue); + // Generates common IFM2 registers + void GenerateIFM2(OpType opType, const HLCFeatureMap &fm, const Box &inputArea, bool isScalar, int32_t scalarValue); + // Generates OFM registers + void GenerateOFM(OpType opType, const HLCFeatureMap &fm, const Box &outputArea); + // Generates WEIGHT registers + void GenerateWeights(const HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Generates SCALE registers + void 
GenerateScales(const HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Generates OFM_BLK_HEIGHT/WIDTH/DEPTH registers + void GenerateBlockConfig(const EthosU85OpConfig *config, const HLCFeatureMap &fm); + // Generates ACC_FORMAT register + void GenerateAccFormat(const EthosU85OpConfig *config); + // Calculates and generates KERNEL_WAIT or DMA_WAIT register + void GenerateWaits(bool isKernelWait, const MemoryAccesses &memoryAccesses, int maxWaits, + std::deque &outstandingAccesses, std::deque &accessesToUpdate); + // Inserts DMA commands for copying LUTs from constant memory + // to LUT memory + std::vector> InsertLUTDMACommands(std::vector> &cmds); + + //---------------------------------------------------------------------- + // Operations + //---------------------------------------------------------------------- + + // Generates NPU_OP_* command + void GenerateOperationCode(const HLCOperation *op); + void GenerateCommon(const HLCStripe *stripe, bool useGlobalScale, MemoryAccesses &memoryAccesses); + // Conv2D/Depthwise operations + void GenerateConvolutionOp(const HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // MaxPool/AvgPool or operations that are mapped to AvgPool + void GeneratePoolingOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Elementwise operations + void GenerateElementwiseOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Resize operations + void GenerateResizeOp(HLCStripe *stripe, MemoryAccesses &memoryAccesses); + bool GenerateStripe(HLCStripe *stripe, MemoryAccesses &memoryAccesses); + // Generates register commands for DMA operations + void GenerateDMA(const HLCDMA *dma, MemoryAccesses &memoryAccesses); + + virtual void GenerateInitialRegisterSetup() + { + // No special initial setup for Ethos U85 + } + +public: + std::vector GenerateCommandStream(std::vector> &highLevelCommandStream, + std::vector> *cmdRanges, bool verbose) override; + static uint32_t ConfigRegister(int macs, int cmdStreamVersion, int 
numAxiSram, int numAxiExt, int numWd, int product); + static bool IsSupportedElementwise(const OpType opType); + static uint32_t IdRegister(); + +private: + ArchEthosU85 *_arch; + // For stripes that use LUT: the LUT slot to be used + std::unordered_map _stripeToLutSlot; + EthosU85Emitter _emit; +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.cpp b/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.cpp new file mode 100644 index 00000000..0056dc8d --- /dev/null +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.cpp @@ -0,0 +1,327 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "ethos_u85_scaling.hpp" + +#include "common/logging.hpp" + +#include "architecture/ethos_u_scaling.hpp" +#include "compiler/high_level_command_stream.hpp" +#include "compiler/op_type.hpp" +#include "compiler/quantization.hpp" + +namespace regor::ethosU85Scaling +{ +namespace +{ +void AdvancedElementwiseAddSubScale(double input1Scale, double input2Scale, double outputScale, int bitDepth, + QuantizedScale &input1Rescale, QuantizedScale &input2Rescale, QuantizedScale &outScale) +{ + int inputShift = bitDepth == 8 ? 
20 : 15; + double ifm1Rescale; + double ifm2Rescale; + SimplifiedElementwiseAddSubScale(input1Scale, input2Scale, outputScale, inputShift, ifm1Rescale, ifm2Rescale, outScale); + input1Rescale = QuantizedScale(ifm1Rescale); + input2Rescale = QuantizedScale(ifm2Rescale); +} + +float GetScale(const Quantization *quant) +{ + if ( quant != nullptr && quant->scales.size() != 0 ) + { + // Use single precision to match reference + return float(quant->scales[0].Dequantize()); + } + else + { + return 1.0f; + } +} + +} // namespace + +void RescaleConvolution(HLCOperation *op) +{ + int ifmCnt = int(op->ifm.size()); + Quantization *ifm1Quant = &op->ifm[0].quantization; + Quantization *ifm2Quant = ifmCnt == 2 ? &op->ifm[1].quantization : nullptr; + Quantization *ofmQuant = &op->ofm.quantization; + + if ( ofmQuant->type == QuantizationType::EXPLICIT ) + { + return; + } + + QuantizedScale outScale(1, 0); + + double ifm1Scale = GetScale(ifm1Quant); + double ifm2Scale = GetScale(ifm2Quant); + double ofmScale = GetScale(ofmQuant); + + DataType ifmDataType = op->ifm[0].dataType; + OpType opType = op->type; + + bool allHaveScale = + (!ifm1Quant->scales.empty() && !ofmQuant->scales.empty() && ifm2Quant != nullptr && !ifm2Quant->scales.empty()); + + bool reducedScale = DataTypeSizeBits(ifmDataType) != 8; + + // If ifmCnt is 2 then it is a convolution with dynamic weights and global scale is used + if ( ifmCnt == 2 && allHaveScale ) + { + if ( reducedScale ) + { + outScale = QuantizedScale((ifm1Scale * ifm2Scale) / ofmScale, true); + } + else + { + outScale = ElementwiseMulScale(ifm1Scale, ifm2Scale, ofmScale); + } + } + if ( ofmQuant != nullptr && ofmQuant->type == QuantizationType::TFLITE ) + { + ofmQuant->scales.clear(); + ofmQuant->scales.push_back(outScale); + ofmQuant->type = QuantizationType::EXPLICIT; + } +} + +void RescaleElementwise(HLCOperation *op) +{ + int ifmCnt = int(op->ifm.size()); + Quantization *ifm1Quant = &op->ifm[0].quantization; + Quantization *ifm2Quant = ifmCnt == 
2 ? &op->ifm[1].quantization : nullptr; + Quantization *ofmQuant = &op->ofm.quantization; + + if ( ifm1Quant->type == QuantizationType::EXPLICIT && ofmQuant->type == QuantizationType::EXPLICIT && + (ifm2Quant == nullptr || ifm2Quant->type == QuantizationType::EXPLICIT) ) + { + return; + } + + QuantizedScale input1Scale(1, 0); + QuantizedScale input2Scale(1, 0); + QuantizedScale outScale(1, 0); + + double ifm1Scale = GetScale(ifm1Quant); + double ifm2Scale = GetScale(ifm2Quant); + double ofmScale = GetScale(ofmQuant); + + DataType ifmDataType = op->ifm[0].dataType; + OpType opType = op->type; + + bool allHaveScale = + (!ifm1Quant->scales.empty() && !ofmQuant->scales.empty() && ifm2Quant != nullptr && !ifm2Quant->scales.empty()); + + int bitDepth = DataTypeSizeBits(ifmDataType); + if ( opType == OpType::Div ) + { + // Div scales should always be Unit + } + else if ( opType == OpType::Mul ) + { + if ( allHaveScale ) + { + outScale = ElementwiseMulScale(ifm1Scale, ifm2Scale, ofmScale); + } + } + else if ( opType == OpType::Abs || opType == OpType::LeakyRelu ) + { + if ( opType == OpType::LeakyRelu ) + { + input1Scale = QuantizedScale(ifm1Scale / ofmScale); + } + } + else if ( opType == OpType::Add || opType == OpType::Sub ) + { + if ( allHaveScale ) + { + AdvancedElementwiseAddSubScale(ifm1Scale, ifm2Scale, ofmScale, bitDepth, input1Scale, input2Scale, outScale); + } + } + + if ( ifm1Quant != nullptr && ifm1Quant->type == QuantizationType::TFLITE ) + { + ifm1Quant->scales.clear(); + ifm1Quant->scales.push_back(input1Scale); + ifm1Quant->type = QuantizationType::EXPLICIT; + } + if ( ifm2Quant != nullptr && ifm2Quant->type == QuantizationType::TFLITE ) + { + ifm2Quant->scales.clear(); + ifm2Quant->scales.push_back(input2Scale); + ifm2Quant->type = QuantizationType::EXPLICIT; + } + if ( ofmQuant != nullptr && ofmQuant->type == QuantizationType::TFLITE ) + { + ofmQuant->scales.clear(); + ofmQuant->scales.push_back(outScale); + ofmQuant->type = QuantizationType::EXPLICIT; + 
} +} + +void RescalePooling(HLCOperation *op, bool isNoOp) +{ + Quantization *ifm1Quant = &op->ifm[0].quantization; + Quantization *ofmQuant = &op->ofm.quantization; + uint32_t scale = 1; + int shift = 0; + DataType ifmDataType = op->ifm[0].dataType; + OpType opType = op->type; + + if ( ofmQuant->type != QuantizationType::TFLITE ) + { + // Explicit scaling + return; + } + + if ( opType == OpType::MaxPool || opType == OpType::ArgMax ) + { + // Do nothing + } + else if ( !ifm1Quant->scales.empty() && !ofmQuant->scales.empty() ) + { + double ifmScale = GetScale(ifm1Quant); + double ofmScale = GetScale(ofmQuant); + if ( opType == OpType::Sigmoid || opType == OpType::Tanh ) + { + assert(ifmDataType == DataType::Int16); + double rescale = 0x3000 * ifmScale; + // Calculate scale and shift for the output scale of 1/(3*4096) + double xLog2 = std::log2(ifmScale); + int roundedLog2 = int(std::round(xLog2)); + bool isPowerOf2 = std::abs(xLog2 - roundedLog2) < 0.001; + shift = roundedLog2 + 12; + if ( isPowerOf2 && ((opType == OpType::Tanh && (shift == 0 || shift == 1)) || (opType == OpType::Sigmoid && shift == 0)) ) + { + // Special handling if input scale is 1/2048 or 1/4096 + scale = 3 << shift; + shift = 0; + } + else + { + shift = 0; + int maxRescale = 16384; + while ( rescale < maxRescale && shift <= 30 ) + { + shift++; + rescale *= 2; + } + scale = uint32_t(rescale); + } + } + else if ( opType == OpType::MemoryCopy ) + { + double rescale = ifmScale / ofmScale; + // In the case of concat or other memory operation, rescaling might be needed. 
+ // The scale is maximised, to get maximum precision + QuantizePoolingScaleMaxPrecision(op->kernel.ElementsWH(), rescale, scale, shift, 31); + } + else if ( opType == OpType::Quantize ) + { + // Quantize operations need double-precision scaling + QuantizedScale quantScale(ifmScale / ofmScale); + scale = uint32_t(quantScale.scale); + shift = quantScale.shift; + } + else if ( isNoOp ) + { + QuantizedScale quantScale(float(ifmScale) / float(ofmScale)); + scale = uint32_t(quantScale.scale); + shift = quantScale.shift; + } + else + { + // Normal pooling operation, without need for special scaling + double rescale = ifmScale / ofmScale; + QuantizePoolingScale(op->kernel.ElementsWH(), rescale, 0, scale, shift, 31); + } + } + ofmQuant->scales.clear(); + ofmQuant->scales.push_back({int32_t(scale), shift}); + ofmQuant->type = QuantizationType::EXPLICIT; +} + +Quantization RescalePerChannel(const Quantization &ifmQuant, const Quantization &weightQuant, + const Quantization &ofmQuant, const DataType scaleDataType, const DataType ifmDataType) +{ + if ( ofmQuant.type != QuantizationType::TFLITE ) + { + // Explicit quantized scale has already been set + return ofmQuant; + } + + Quantization quantResult; + quantResult.type = QuantizationType::EXPLICIT; + quantResult.zeroPoints = ofmQuant.zeroPoints; + quantResult.quantMin = ofmQuant.quantMin; + quantResult.quantMax = ofmQuant.quantMax; + quantResult.dimension = ofmQuant.dimension; + quantResult.forceZeroPoint = ofmQuant.forceZeroPoint; + + if ( !ifmQuant.scales.empty() && !ofmQuant.scales.empty() && !weightQuant.scales.empty() ) + { + DataType dataType = DataType::None; + bool reducedScale = false; + if ( scaleDataType == DataType::Int32 ) + { + switch ( ifmDataType ) + { + case DataType::Int8: + case DataType::UInt8: + case DataType::Int16: + dataType = ifmDataType; + break; + default: + break; + } + } + else if ( scaleDataType == DataType::Int64 && DataTypeSizeBits(ifmDataType) == 16 ) + { + dataType = DataType::Int16; + 
reducedScale = true; + } + + int modIfm = (ifmQuant.scales.size()) == 1 ? 0 : -1; + int modOfm = (ofmQuant.scales.size()) == 1 ? 0 : -1; + + quantResult.scales.reserve(weightQuant.scales.size()); + + for ( int i = 0; i < int(weightQuant.scales.size()); i++ ) + { + double v = 1.0; + float ifmScale = float(ifmQuant.scales[i & modIfm].Dequantize()); + float ofmScale = float(ofmQuant.scales[i & modOfm].Dequantize()); + float weightScale = float(weightQuant.scales[i].Dequantize()); + if ( dataType == DataType::UInt8 ) + { + v = double(ifmScale * weightScale) / double(ofmScale); + } + else if ( dataType == DataType::Int8 || dataType == DataType::Int16 ) + { + v = (double(ifmScale) * double(weightScale)) / double(ofmScale); + } + + quantResult.scales.emplace_back(v, reducedScale); + } + } + + return quantResult; +} + +} // namespace regor::ethosU85Scaling diff --git a/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.hpp b/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.hpp new file mode 100644 index 00000000..aa8bed53 --- /dev/null +++ b/ethosu/regor/architecture/ethosu85/ethos_u85_scaling.hpp @@ -0,0 +1,38 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
#pragma once
#include "common/data_type.hpp"

namespace regor
{

// Forward declarations only; the scaling helpers take these by pointer/reference.
class Quantization;
struct HLCOperation;

// Quantized-scale derivation for Ethos-U85. The Rescale* helpers convert the
// TFLITE-style floating-point scales carried on a high-level-compiler (HLC)
// operation into the explicit integer (scale, shift) pairs consumed by the
// hardware.
namespace ethosU85Scaling
{

// Derive the OFM scale/shift for pooling-type ops. isNoOp selects a separate
// single-precision rescale path (see the implementation's isNoOp branch).
void RescalePooling(HLCOperation *op, bool isNoOp);
// Derive the OFM scale/shift for convolution ops (implementation not shown here).
void RescaleConvolution(HLCOperation *op);
// Derive the OFM scale/shift for elementwise ops (implementation not shown here).
void RescaleElementwise(HLCOperation *op);
// Compute per-channel explicit quantization from IFM/weight/OFM scales.
// Returns ofmQuant unchanged when its type is already non-TFLITE (explicit).
Quantization RescalePerChannel(const Quantization &ifmQuant, const Quantization &weightQuant,
    const Quantization &ofmQuant, const DataType scaleDataType, const DataType ifmDataType);

}  // namespace ethosU85Scaling
}  // namespace regor
+// + +#include "ethos_u85_weight_encoder.hpp" + +#include "common/logging.hpp" + +#include "architecture/architecture.hpp" +#include "architecture/ethos_u_scaling.hpp" +#include "architecture/mlw_encode.hpp" +#include "common/buffer_view.hpp" +#include "common/shape.hpp" +#include "compiler/tensor_properties.hpp" +#include "ethos_u85.hpp" +#include "ethos_u85_scaling.hpp" + +#include +#include +#include + + +namespace regor +{ + + +EthosU85WeightEncoder::EthosUEncodingConfig::EthosUEncodingConfig(int cores, Flags weightFormat) : + _cores(cores), _weightFormat(weightFormat) +{ +} + +void EthosU85WeightEncoder::EthosUEncodingConfig::Rehash() +{ + _hash = SimpleHash32(ofmBlockDepth, traversal, _depthOffsetHash, ifmType, dilation, ohwiStrides); + + _depthOffsetHash = 0; + for ( int offset : this->depthOffsets ) + { + _depthOffsetHash = _depthOffsetHash * 31 ^ offset; + } +} + +uint32_t EthosU85WeightEncoder::EthosUEncodingConfig::Hash() +{ + return _hash; +} + +bool EthosU85WeightEncoder::EthosUEncodingConfig::Equals(IWeightEncodingConfig *other) +{ + EthosUEncodingConfig *p = static_cast(other); + return std::tie(ofmBlockDepth, ifmBlockDepth, traversal, _depthOffsetHash, ifmType, dilation, ohwiStrides, _weightFormat) == + std::tie(p->ofmBlockDepth, p->ifmBlockDepth, p->traversal, p->_depthOffsetHash, p->ifmType, p->dilation, p->ohwiStrides, p->_weightFormat); +} + +const std::vector &EthosU85WeightEncoder::EthosUEncodingConfig::DepthOffsets() +{ + return this->depthOffsets; +} + +Flags EthosU85WeightEncoder::EthosUEncodingConfig::Format() +{ + return _weightFormat; +} + + +std::unique_ptr EthosU85WeightEncoder::GetEncodingConfig(ArchitectureOpConfig *opCfg, const WeightsRef &weights, + const Kernel *kernel, DataType ifmType, const std::vector &depthOffsets, Flags format) +{ + std::unique_ptr params = std::make_unique(_arch->_cores, format); + + EthosU85OpConfig *opConfig = static_cast(opCfg); + params->ofmUBlock = opConfig->OfmUBlock(); + params->ofmBlockDepth = 
// Pack one (bias, scale, shift) triple into the 10-byte Acc32 scale-stream
// record:
//   bytes 0-3 : bias  (signed 32-bit, little-endian)
//   bytes 4-7 : scale (unsigned 31-bit, little-endian, top bit forced clear)
//   byte  8   : shift (unsigned 6-bit)
//   byte  9   : zero padding
// Returns the number of bytes written (always 10).
static int EncodeBias32(int64_t bias, int32_t scale, int shift, uint8_t data[10])
{
    assert(bias >= -(1LL << 31) && bias < (1LL << 31));  // must fit signed 32 bits
    assert(scale >= 0);                                  // unsigned 31-bit range
    assert(shift >= 0 && shift < (1 << 6));              // unsigned 6-bit range

    // Bias: four little-endian bytes.
    for ( int i = 0; i < 4; i++ )
    {
        data[i] = uint8_t((bias >> (i * 8)) & 0xFF);
    }
    // Scale: three full bytes plus a 7-bit top byte (bit 31 never encoded).
    for ( int i = 0; i < 3; i++ )
    {
        data[4 + i] = uint8_t((scale >> (i * 8)) & 0xFF);
    }
    data[7] = uint8_t((scale >> 24) & 0x7F);
    data[8] = uint8_t(shift & 0x3F);
    data[9] = 0;
    return 10;
}
// Validates that a weight stream satisfies 2:4 structured sparsity while it
// is being fed to the encoder: within every aligned group of 4 consecutive
// weights at the same kernel position, at least 2 must be zero, and every
// weight must fit in [-127, 127]. Any violation throws WeightsNotSparse.
struct SparsityTracker
{
    int _sparse_zeroes = 4;           // zeros seen in the current group of 4
    int _sparse_index = 0;            // weight index within the current position
    uint32_t _sparse_pos = 0xFFFFFFFF;  // packed (y << 16 | x) kernel position
    // Forget the current position so the next Check() starts a fresh run.
    void Reset() { _sparse_pos = 0xFFFFFFFF; }

    void Check(uint32_t pos, int depth, int weight)
    {
        if ( _sparse_pos != pos )
        {
            // New kernel position: restart counting. Sparse groups must start
            // on a multiple-of-4 depth boundary.
            _sparse_pos = pos;
            _sparse_zeroes = 0;
            _sparse_index = 0;
            if ( depth & 3 ) throw WeightsNotSparse();
        }

        if ( weight == 0 ) _sparse_zeroes++;
        // Sparse weights must be representable in 8 bits excluding -128.
        else if ( weight > 127 || weight < -127 ) throw WeightsNotSparse();

        // End of a group of 4: require the 2-zeros-per-4 invariant.
        if ( (_sparse_index & 3) == 3 )
        {
            if ( _sparse_zeroes < 2 ) throw WeightsNotSparse();
            _sparse_zeroes = 0;
        }

        _sparse_index++;
    }
};
ofmBlockDepth, int ifmBlockDepth, int ifmBitDepth, + int ofmUBlockDepth, WeightTransformFunc func, WeightTransformParam *param, EthosU85Traversal traversal, bool sparse) + { + const bool ifm16bit = (ifmBitDepth == 16); + _streams = cores; + _transform = func; + _param = param; + _traversal = traversal; + _sparse = sparse; + _stride = stride; + + _ofmBlockDepth = ofmBlockDepth; + _ifmBlockDepth = ifmBlockDepth; + _ofmUBlockDepth = short(ofmUBlockDepth); + + if ( traversal == EthosU85Traversal::PartKernel ) + { + _subKernelRound = (ifm16bit || sparse) ? 10 : 5; + _ifmUBlockDepth = ifm16bit && !sparse ? 8 : _ifmBlockDepth; + } + else + { + if ( traversal == EthosU85Traversal::DepthFirst ) + { + _stride = Point2i(1, 1); + _subKernelRound = 1; + _ifmUBlockDepth = _ifmBlockDepth; + } + else if ( traversal == EthosU85Traversal::Depthwise ) + { + _subKernelRound = 10; + _ifmUBlockDepth = 1; + } + } + + _decompX = short(8 / dilation.x); + _decompY = short(8 / dilation.y); + _dwPaddingCount = (!ifm16bit && macs <= 256) ? 0 : (macs <= 512) ? 
    // Produce up to `count` reordered weights into `output`, dispatching to
    // the traversal-specialised generator. Returns the number of weights
    // written; fewer than `count` signals end-of-stream.
    // NOTE(review): the GetNext calls below appear to have lost their
    // template argument lists in extraction — each branch presumably selects
    // a distinct <IS_DEPTHWISE, IS_SPARSE> specialisation (depthwise,
    // sparse, and the default dense case); confirm against upstream source.
    int Get(int16_t *output, int count) override
    {
        if ( _traversal == EthosU85Traversal::Depthwise )
        {
            // Depthwise traversal and 2:4 sparsity are mutually exclusive.
            assert(!_sparse);
            return GetNext(output, count);
        }
        else if ( _sparse )
        {
            return GetNext(output, count);
        }

        return GetNext(output, count);
    }
1 : clippedIfmBlockDepth; + for ( ifmUBlockOuter = _ifmUBlockOuter; ifmUBlockOuter < ifmBlockDepthOuter; ifmUBlockOuter += _ifmUBlockDepth ) + { + // OFM uBlocks in OFM-block over depth + for ( ofmUBlockOuter = _ofmUBlockOuter; ofmUBlockOuter < clippedOfmBlockDepth; ofmUBlockOuter += _ofmUBlockDepth ) + { + // Part kernel first works across the kernel H/W and needs padding + if ( !_subKernelElements ) + { + int subKernelElements = subWidth * subHeight; + _subKernelElements = RoundAway(subKernelElements, _subKernelRound); + } + for ( strideY = _strideY; strideY < _stride.y; ++strideY ) + { + int stridedKernelH = (subHeight + _stride.y - 1 - strideY) / _stride.y; + for ( strideX = _strideX; strideX < _stride.x; ++strideX ) + { + int stridedKernelW = (subWidth + _stride.x - 1 - strideX) / _stride.x; + for ( kernelY = _kernelY; kernelY < stridedKernelH; ++kernelY ) + { + int y = kernelY; + for ( kernelX = _kernelX; kernelX < stridedKernelW; ++kernelX ) + { + int x = kernelY % 2 == 0 ? kernelX : stridedKernelW - 1 - kernelX; + _subKernelElements--; + int ifmUBlockInnerStep = IS_DEPTHWISE ? 1 : (IS_SPARSE ? 
16 : 8); + for ( ifmUBlockInner = _ifmUBlockInner; ifmUBlockInner < _ifmUBlockDepth; ifmUBlockInner += ifmUBlockInnerStep ) + { + // Feed OFM uBlock elements + for ( ofmUBlockZ = _ofmUBlockZ; ofmUBlockZ < _ofmUBlockDepth; ofmUBlockZ += InterleaveDepth * _streams ) + { + for ( ofmUBlockInner = _ofmUBlockInner; ofmUBlockInner < InterleaveDepth; ofmUBlockInner++ ) + { + // Source IFM uBlock elements (only 1 element deep if + // depthwise) + for ( ifmUBlockZ = _ifmUBlockZ; ifmUBlockZ < ifmUBlockInnerStep; ifmUBlockZ++ ) + { + // Source position within the current subkernel + int wx = subKernelX + strideX + x * _stride.x; + int wy = subKernelY + strideY + y * _stride.y; + // Source IFM/OFM slices + int ifm_z = ifmBlockZ + ifmUBlockOuter + ifmUBlockInner + ifmUBlockZ; + int ofm_z = ofmBlockZ + ofmUBlockOuter + ofmUBlockInner + ofmUBlockZ; + int weight = 0; + if ( ifm_z < _ifmDepth && ofm_z < _ofmDepth ) + { + _param->o = ofm_z; + _param->h = wy; + _param->w = wx; + _param->i = ifm_z; + weight = int(buffer[WeightIndex(ofm_z, wy, wx, ifm_z)]); + weight = _transform(_param, weight); + } + + if constexpr ( IS_SPARSE ) + _sparsity.Check((unsigned(wy) << 16) | wx, ifm_z, weight); + + *write++ = int16_t(weight); + + if ( --count == 0 ) + { + // Save state + _subKernelElements++; + _ifmUBlockZ = ifmUBlockZ + 1; + _ofmUBlockInner = ofmUBlockInner; + _ofmUBlockZ = ofmUBlockZ; + _ifmUBlockInner = ifmUBlockInner; + _kernelX = kernelX; + _kernelY = kernelY; + _strideX = strideX; + _strideY = strideY; + _ofmUBlockOuter = ofmUBlockOuter; + _ifmUBlockOuter = ifmUBlockOuter; + _subKernelY = subKernelY; + _subKernelX = subKernelX; + _ofmBlockZ = ofmBlockZ; + _ifmLoopInc = -_ifmLoopInc; + return int(intptr_t(write - output)); + } + } + _ifmUBlockZ = 0; + } + _ofmUBlockInner = 0; + } + _ofmUBlockZ = _streamIndex * InterleaveDepth; + } + // Depthwise padding + if ( IS_DEPTHWISE && _subKernelElements % _subKernelRound == 0 ) + { + int padCount = _dwPaddingCount * _ofmUBlockDepth / _streams; + 
for ( padding = _padding; padding < padCount; padding++ ) + { + *write++ = 0; + if ( --count == 0 ) + { + // Save state + _subKernelElements++; + _padding = padding + 1; + _ifmUBlockInner = ifmUBlockInner; // Will skip loop above + _kernelX = kernelX; + _kernelY = kernelY; + _strideX = strideX; + _strideY = strideY; + _ofmUBlockOuter = ofmUBlockOuter; + _ifmUBlockOuter = ifmUBlockOuter; + _subKernelY = subKernelY; + _subKernelX = subKernelX; + _ofmBlockZ = ofmBlockZ; + _ifmLoopInc = -_ifmLoopInc; + return int(intptr_t(write - output)); + } + } + _padding = 0; + } + _ifmUBlockInner = 0; + } + _kernelX = 0; + } + _kernelY = 0; + } + _strideX = 0; + } + // Padding + if ( _subKernelElements > 0 ) + { + int padCount = _subKernelElements + (IS_DEPTHWISE ? _dwPaddingCount : 0); + padCount = padCount * _ifmUBlockDepth * _ofmUBlockDepth / _streams; + for ( padding = _padding; padding < padCount; padding++ ) + { + *write++ = 0; + if ( --count == 0 ) + { + // Save state + _padding = padding + 1; + _strideY = strideY; // Will skip loop above + _ofmUBlockOuter = ofmUBlockOuter; + _ifmUBlockOuter = ifmUBlockOuter; + _subKernelY = subKernelY; + _subKernelX = subKernelX; + _ofmBlockZ = ofmBlockZ; + _ifmLoopInc = -_ifmLoopInc; + return int(intptr_t(write - output)); + } + } + _padding = 0; + } + _subKernelElements = 0; + _strideY = 0; + } + _ofmUBlockOuter = 0; + } + _ifmUBlockOuter = 0; + } + _subKernelY = 0; + } + _subKernelX = 0; + } + } + _ifmLoopInc = -_ifmBlockDepth; + _ifmBlockZ = 0; + _ofmBlockZ = 0; + // Return weights generated (less than requested count == EOS) + return int(intptr_t(write - output)); + } +}; + + +template +class EthosU85WeightOrderingFwd : public WeightSourceCommon +{ +protected: + int _weightBlockSize; + int _blockSizeEmitted; + EthosU85Traversal _traversal; + std::vector>> _weightSource; + +public: + EthosU85WeightOrderingFwd(int cores, int macs, Point2i stride, const Point2i &dilation, int ofmBlockDepth, int ifmBlockDepth, + int ifmBitDepth, int 
    // Register the raw OHWI weight buffer for this depth slice, then fan the
    // same buffer out to every per-stream ordering object so their outputs
    // can later be interleaved block-by-block in Get().
    void SetSource(const void *buffer, int depthOffset, const Shape &ohwiShape, const Shape &ohwiStrides, int streamIndex) override
    {
        SetSourceCommon(buffer, depthOffset, ohwiShape, ohwiStrides, streamIndex, false);
        // Interleaving restarts from the caller-selected stream.
        _streamIndex = streamIndex;
        for ( int stream = 0; stream < _streams; ++stream )
        {
            _weightSource[stream]->SetSource(buffer, depthOffset, ohwiShape, ohwiStrides, stream);
        }
    }
_blockSizeEmitted = 0; + } + _streamIndex = 0; + } + // Return weights generated (less than requested count == EOS) + return offset; + } +}; + + +std::unique_ptr EthosU85WeightEncoder::GetWeightSource( + IWeightEncodingConfig *config, DataType weightType, WeightTransformFunc func, WeightTransformParam *param) +{ + EthosUEncodingConfig *cfg = static_cast(config); + + int ofmUBlockDepth = cfg->ofmUBlock.Depth(); + + int ifmBitDepth = DataTypeSizeBits(cfg->ifmType); + bool isFast = cfg->Format() & WeightFormat::Fast; + bool isSparse = cfg->Format() & WeightFormat::Sparse2_4; + + if ( weightType == DataType::UInt8 ) + { + if ( isFast && _arch->_cores > 1 ) // No interleaving needed for FWD if only one stream + { + return std::make_unique>(_arch->_cores, _arch->_macs, cfg->stride, cfg->dilation, + cfg->ofmBlockDepth, cfg->ifmBlockDepth, ifmBitDepth, ofmUBlockDepth, func, param, cfg->traversal, isSparse); + } + else + { + assert(!(isFast && cfg->traversal == EthosU85Traversal::Depthwise)); + return std::make_unique>(_arch->_cores, _arch->_macs, cfg->stride, cfg->dilation, + cfg->ofmBlockDepth, cfg->ifmBlockDepth, ifmBitDepth, ofmUBlockDepth, func, param, cfg->traversal, isSparse); + } + } + else if ( weightType == DataType::Int8 ) + { + if ( isFast && _arch->_cores > 1 ) // No interleaving needed for FWD if only one stream + { + return std::make_unique>(_arch->_cores, _arch->_macs, cfg->stride, cfg->dilation, + cfg->ofmBlockDepth, cfg->ifmBlockDepth, ifmBitDepth, ofmUBlockDepth, func, param, cfg->traversal, isSparse); + } + else + { + assert(!(isFast && cfg->traversal == EthosU85Traversal::Depthwise)); + return std::make_unique>(_arch->_cores, _arch->_macs, cfg->stride, cfg->dilation, + cfg->ofmBlockDepth, cfg->ifmBlockDepth, ifmBitDepth, ofmUBlockDepth, func, param, cfg->traversal, isSparse); + } + } + + assert(false && "No weight source for this datatype"); + return nullptr; +} + + +template +class EthosU85ScaleSource : public IVolumeScaleSource +{ +private: + const 
    // Emit up to `count` (bias, quantized-scale) pairs into the two output
    // buffers; returns the number emitted. Indices past the real bias buffer
    // produce zero padding (bias 0, scale (0,0)) so the stream can be rounded
    // up to the micro-block multiple computed in SetSource(). Scales repeat
    // cyclically, covering both per-tensor (single scale) and per-channel use.
    int Get(int64_t *biasBuffer, QuantizedScale *quantBuffer, int count)
    {
        count = std::min(count, _biasCount);
        const size_t scaleSize = _quantization.scales.size();

        for ( int i = 0; i < count; i++ )
        {
            int index = _biasIndex + i;
            if ( index < _bufferSize )
            {
                *biasBuffer++ = _buffer[index];
                *quantBuffer++ = _quantization.scales[index % scaleSize];
            }
            else
            {
                // Past the end of the real biases: pad with neutral values.
                *biasBuffer++ = 0;
                *quantBuffer++ = QuantizedScale(0, 0);
            }
            _biasCount--;
        }

        _biasIndex += count;
        return count;
    }
// Convert TFLITE-style IFM/weight/OFM quantization into the explicit
// per-channel (scale, shift) form used by the hardware scale stream.
Quantization EthosU85WeightEncoder::MakeExplicit(const Quantization &ifmQ, const Quantization &weightQ,
    const Quantization &ofmQ, DataType scaleType, DataType ifmType)
{
    // All 16-bit IFM types are rescaled as signed Int16.
    if ( DataTypeSizeBits(ifmType) == 16 ) ifmType = DataType::Int16;

    return ethosU85Scaling::RescalePerChannel(ifmQ, weightQ, ofmQ, scaleType, ifmType);
}
// Encode the bias/scale stream: each element packs to a fixed 10-byte record
// via EncodeBias32 or EncodeBias48, selected by the accumulator width of the
// op config. Appends to `result` and returns the number of bytes written.
// When measureOnly is set, returns the exact final size without encoding.
// NOTE(review): the parameter type `std::vector &result` and the bare
// `static_cast(config)` below appear to have lost their template argument
// lists in extraction (presumably std::vector<uint8_t> and
// static_cast<EthosUEncodingConfig *>); confirm against upstream source.
int EthosU85WeightEncoder::EncodeScales(IWeightEncodingConfig *config, IScaleSource *source, std::vector &result, bool measureOnly)
{
    EthosUEncodingConfig *cfg = static_cast(config);

    constexpr int BUFFER_SIZE = 8;
    constexpr int SCALE_ELEMENT_SIZE = 10;  // bytes per encoded bias/scale record
    // Acc32 packs a 32-bit bias + 31-bit scale; Acc48 a 48-bit bias + 15-bit scale.
    auto EncodeBias = cfg->acc == EthosU85Accumulator::Acc32 ? EncodeBias32 : EncodeBias48;

    if ( measureOnly )
    {
        return source->Elements() * SCALE_ELEMENT_SIZE; // Must be accurate
    }

    int64_t scaleBuffer[BUFFER_SIZE];
    QuantizedScale quantBuffer[BUFFER_SIZE];

    int start = int(result.size());
    int write = start;
    // Reserve the full final size up front so the resize in the loop never reallocates.
    result.reserve(start + source->Elements() * SCALE_ELEMENT_SIZE);
    while ( true )
    {
        // Pull up to BUFFER_SIZE (bias, scale) pairs; a short read means end-of-stream.
        int count = source->Get(scaleBuffer, quantBuffer, BUFFER_SIZE);
        result.resize(write + (count * SCALE_ELEMENT_SIZE));

        for ( int i = 0; i < count; i++ )
        {
            write += EncodeBias(scaleBuffer[i], quantBuffer[i].scale, quantBuffer[i].shift, &result[write]);
        }
        if ( count < BUFFER_SIZE )
        {
            break;
        }
    }

    return write - start;
}
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "architecture/architecture.hpp" +#include "architecture/ethos_u_scaling.hpp" +#include "architecture/ethosu85/ethos_u85.hpp" +#include "architecture/mlw_encode.hpp" +#include "architecture/weight_encoder.hpp" +#include "common/shape.hpp" + +namespace regor +{ + +/// +/// Encodes weights and biases. +/// +class EthosU85WeightEncoder : public WeightEncoder +{ +private: + struct EthosUEncodingConfig : IWeightEncodingConfig + { + private: + uint32_t _hash = 0; + uint32_t _depthOffsetHash = 0; + int _cores = 0; + Flags _weightFormat = WeightFormat::Default; + + public: + DataType ifmType = DataType::None; + int ofmBlockDepth = 0; + int ifmBlockDepth = 0; + EthosU85Traversal traversal = EthosU85Traversal::DepthFirst; + EthosU85Accumulator acc = EthosU85Accumulator::Acc32; + std::vector depthOffsets; + Point2i dilation; + Point2i stride; + Shape ohwiStrides; + Shape ofmUBlock; + + public: + EthosUEncodingConfig(int cores, Flags weightFormat); + void Rehash(); + uint32_t Hash() override; + bool Equals(IWeightEncodingConfig *other) override; + const std::vector &DepthOffsets() override; + Flags Format() override; + }; + +public: + EthosU85WeightEncoder(ArchEthosU85 *arch) : _arch(arch) {} + +public: + std::unique_ptr GetEncodingConfig(ArchitectureOpConfig *opCfg, const WeightsRef &weights, + const Kernel *kernel, DataType ifmType, const std::vector &depthOffsets, Flags format); + + int StreamsRequired(IWeightEncodingConfig *config, const Shape &weightShape, int &scaleStreamsRequired); + + std::unique_ptr GetWeightSource( + 
IWeightEncodingConfig *config, DataType weightType, WeightTransformFunc func, WeightTransformParam *param); + + std::unique_ptr GetScaleSource(IWeightEncodingConfig *config, DataType scaleType, const Quantization &explicitQuant); + + Quantization MakeExplicit(const Quantization &ifmQ, const Quantization &weightQ, const Quantization &ofmQ, + DataType scaleType, DataType ifmType); + + WeightsInfo EncodeWeights(IWeightEncodingConfig *config, IWeightSource *source, std::vector &result, bool measureOnly); + + int EncodeScales(IWeightEncodingConfig *config, IScaleSource *source, std::vector &result, bool measureOnly); + +private: + ArchEthosU85 *_arch; +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/mlw_encode.cpp b/ethosu/regor/architecture/mlw_encode.cpp new file mode 100644 index 00000000..0b5f48f8 --- /dev/null +++ b/ethosu/regor/architecture/mlw_encode.cpp @@ -0,0 +1,125 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "mlw_encode.hpp" + +#include "common/common.hpp" + +#include "common/bit_flags.hpp" + +#include +#include + +BEGIN_ENUM_TABLE(WeightFormat) + ADD_ENUM_NAME(Default) + ADD_ENUM_NAME(Fast) + ADD_ENUM_NAME(Sparse2_4) +END_ENUM_TABLE() + +thread_local static std::vector *sResult = nullptr; + +static void *reallocFunc(void *ptr, size_t reserve, int purpose) +{ + UNUSED(purpose); + assert(sResult); + assert(purpose == MLW_ENCODE_ALLOC_STREAM0); + size_t offset = ptr ? static_cast(ptr) - sResult->data() : sResult->size(); + sResult->resize(reserve + offset); + return reserve ? static_cast(sResult->data() + offset) : nullptr; +} + +MlwEncodeResult mle_encode_proxy(IWeightSource *source, int chunkSize, std::vector &output, unsigned encodeFlags) +{ + assert(sResult == nullptr); + sResult = &output; + auto output_size = output.size(); + ml_encode_result_t res; + int zeroCount = 0; + ml_ethosu_encode_params_t params; + params.encoder_flags = encodeFlags; + params.source_buffering_hint = chunkSize; + params.realloc_func = reallocFunc; + + auto weight_func = [](int32_t query, ml_source_state_t *state, int16_t *buffer, int32_t size, void *user_arg) + { + UNUSED(query); + assert(query == MLW_SOURCE_QUERY_WEIGHTS); + IWeightSource *src = reinterpret_cast(user_arg); + int source_size = src->Get(buffer, size); + state->eos = source_size < size; + return source_size; + }; + + try + { + mle_context_t *ctx = nullptr; + auto ret = ml_encode_ethosu_stream(&res, ¶ms, weight_func, source, &ctx); + if ( ret < 0 ) throw std::runtime_error("mlw encode failed"); + zeroCount = mle_context_query_zeroes(ctx); + mle_destroy_context(ctx); + } + catch ( const std::runtime_error & ) + { + sResult = nullptr; + throw; + } + sResult = nullptr; + res.encoded_data = nullptr; // Data owned by output + mle_free(&res); + output.resize(output_size + res.encoded_length); + return {res.source_length, res.encoded_length, zeroCount}; +} + +MlwEncodeResult mle_encode_fwd_proxy(IWeightSource *source, 
int chunkSize, std::vector &output, unsigned encodeFlags) +{ + assert(sResult == nullptr); + sResult = &output; + auto output_size = output.size(); + int count = 0; + int totalCount = 0; + int zeroCount = 0; + std::vector weights; + + try + { + do + { + weights.resize(weights.size() + chunkSize); + count = source->Get(weights.data() + totalCount, chunkSize); + totalCount += count; + } while ( count == chunkSize ); + weights.resize(totalCount); + } + catch ( const std::runtime_error & ) + { + sResult = nullptr; + throw; + } + + ml_encode_result_t res; + mle_context_t *ctx = mle_create_context(MLW_ENCODE_SYNTAX_ETHOSU_FWD); + mle_context_set_allocator(ctx, reallocFunc); + mle_encode(ctx, &res, weights.data(), totalCount, encodeFlags); + zeroCount = mle_context_query_zeroes(ctx); + res.encoded_data = nullptr; // Data owned by output + mle_destroy_context(ctx); + mle_free(&res); + output.resize(output_size + res.encoded_length); + sResult = nullptr; + return {totalCount, res.encoded_length, zeroCount}; +} diff --git a/ethosu/regor/architecture/mlw_encode.hpp b/ethosu/regor/architecture/mlw_encode.hpp new file mode 100644 index 00000000..19d04544 --- /dev/null +++ b/ethosu/regor/architecture/mlw_encode.hpp @@ -0,0 +1,54 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+//
+
+#pragma once
+
+#include <cstdint>
+#include <vector>
+
+enum class WeightFormat : uint16_t
+{
+    Default = 0,
+    Fast = 1,
+    Sparse2_4 = 2
+};
+
+inline constexpr bool operator&(WeightFormat type, WeightFormat mask)
+{
+    return bool(unsigned(type) & unsigned(mask));
+}
+
+struct MlwEncodeResult
+{
+    int elements_read;
+    int bytes_written;
+    int zero_count;
+};
+
+class IWeightSource;
+
+class IWeightSource
+{
+public:
+    virtual ~IWeightSource() = default;
+    virtual int Elements() = 0;
+    virtual int Get(int16_t *buffer, int count) = 0;
+};
+
+MlwEncodeResult mle_encode_proxy(IWeightSource *source, int chunkSize, std::vector<uint8_t> &output, unsigned encodeFlags);
+MlwEncodeResult mle_encode_fwd_proxy(IWeightSource *source, int chunkSize, std::vector<uint8_t> &output, unsigned encodeFlags);
diff --git a/ethosu/regor/architecture/register_command_stream_generator.hpp b/ethosu/regor/architecture/register_command_stream_generator.hpp
new file mode 100644
index 00000000..5f4dc1d5
--- /dev/null
+++ b/ethosu/regor/architecture/register_command_stream_generator.hpp
@@ -0,0 +1,41 @@
+//
+// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the License); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an AS IS BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// + +#pragma once + +#include "compiler/high_level_command_stream.hpp" + +#include +#include +#include +#include +#include + +namespace regor +{ + +class IRegisterCommandStreamGenerator +{ +public: + virtual ~IRegisterCommandStreamGenerator() = default; + virtual std::vector GenerateCommandStream(std::vector> &highLevelCommandStream, + std::vector> *genRanges, bool verbose) = 0; + virtual void PrintCommandStream(const std::vector &stream, std::vector> &debugInfo) = 0; +}; + +} // namespace regor diff --git a/ethosu/regor/architecture/weight_encoder.hpp b/ethosu/regor/architecture/weight_encoder.hpp new file mode 100644 index 00000000..bf217b6c --- /dev/null +++ b/ethosu/regor/architecture/weight_encoder.hpp @@ -0,0 +1,210 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "common/scaling.hpp" +#include "compiler/scheduler_operation.hpp" +#include "mlw_encode.hpp" + +#include +#include +#include + +namespace regor +{ + +/// +/// Contains info about encoded weights/scales for one core/depth offset combination +/// +struct WeightRange +{ + int offset = 0; + int scaleBytes = 0; + int weightOffset = 0; + int weightBytes = 0; + int index = 0; + + int TotalBytes() const { return scaleBytes + weightBytes; } +}; + +/// Produces key for indexing WeightTensor::encodedRanges +constexpr inline int WeightKey(int stream, int depth) +{ + return (depth << 8) + stream; +} + +struct WeightTransformParam +{ + int o, h, w, i; +}; + +typedef int (*WeightTransformFunc)(const WeightTransformParam *param, int weight); + +struct IWeightEncodingConfig +{ + virtual ~IWeightEncodingConfig() = default; + virtual uint32_t Hash() = 0; + virtual bool Equals(IWeightEncodingConfig *other) = 0; + virtual const std::vector &DepthOffsets() = 0; + virtual Flags Format() = 0; +}; + +struct IVolumeWeightSource : public IWeightSource +{ + virtual ~IVolumeWeightSource() = default; + virtual void SetSource(const void *buffer, int depthOffset, const Shape &ohwiShape, const Shape &ohwiStrides, int streamIndex) = 0; +}; + +struct IScaleSource +{ + virtual ~IScaleSource() = default; + virtual int Elements() = 0; + virtual int Get(int64_t *biasBuffer, QuantizedScale *quantBuffer, int count) = 0; +}; + +struct IVolumeScaleSource : public IScaleSource +{ + virtual ~IVolumeScaleSource() = default; + virtual void SetSource(const void *buffer, int biasCount, int depthOffset, int depthLength, int streamIndex) = 0; +}; + + +/// +/// Contains encoded weights and biases. 
+/// +class NpuWeightTensor : public SchedulerTensor +{ +public: + virtual ~NpuWeightTensor() = default; + + /** Required size in bytes if double buffering is applied */ + int maxRangeBytes = 0; + int totalWeightBytes = 0; + int subStreams = 0; + std::unordered_map encodedRanges; + std::unique_ptr config; +}; + +struct WeightScaleTensors +{ + /** Encoded weights, may be null */ + std::shared_ptr npuWeightsTensor; + /** Combined scaling parameters in the weights tensor **/ + uint32_t scaleHash; + /** Encoded scales, may be null */ + std::shared_ptr npuScalesTensor; +}; + + +struct WeightsRef +{ + BufferView *view = nullptr; + AxisOrder axisOrder = AxisOrder::Unknown; + DataType type = DataType::None; + bool isScales = false; +}; + +struct WeightsInfo +{ + int sourceSize = 0; + int encodedSize = 0; + int zeroCount = 0; + int streams = 0; +}; + +/// +/// Encodes weights and biases. +/// +class WeightEncoder +{ +public: + virtual ~WeightEncoder() = default; + + virtual std::unique_ptr GetEncodingConfig(ArchitectureOpConfig *opCfg, const WeightsRef &weights, + const Kernel *kernel, DataType ifmType, const std::vector &depthOffsets, Flags format) = 0; + + virtual int StreamsRequired(IWeightEncodingConfig *config, const Shape &ofmShape, int &scaleStreamsRequired) = 0; + + virtual std::unique_ptr GetWeightSource( + IWeightEncodingConfig *config, DataType weightType, WeightTransformFunc func, WeightTransformParam *param) = 0; + + virtual std::unique_ptr + GetScaleSource(IWeightEncodingConfig *config, DataType scaleType, const Quantization &explicitQuant) = 0; + + virtual Quantization MakeExplicit(const Quantization &ifmQ, const Quantization &weightQ, const Quantization &ofmQ, + DataType scaleType, DataType ifmType) = 0; + + virtual WeightsInfo EncodeWeights( + IWeightEncodingConfig *config, IWeightSource *source, std::vector &result, bool measureOnly) = 0; + virtual int EncodeScales(IWeightEncodingConfig *config, IScaleSource *source, std::vector &result, bool measureOnly) 
= 0; +}; + +// IVolumeWeightSource common implementation +class WeightSourceCommon : public IVolumeWeightSource +{ + +protected: + const void *_source; + int16_t _streams = 1; + int16_t _streamIndex = 0; + int _ofmDepth = 0; + int _ifmDepth = 0; + int _kernelH = 0; + int _kernelW = 0; + int _ohwiStrides[4]; + +protected: + void SetSourceCommon(const void *buffer, int depthOffset, const Shape &ohwiShape, const Shape &ohwiStrides, int streamIndex, bool separated) + { + assert(streamIndex < _streams); + _streamIndex = streamIndex; + + int streamOffset = Shape(depthOffset, 0, 0, 0).Dot(ohwiStrides); + _source = reinterpret_cast(buffer) + streamOffset; + _ifmDepth = ohwiShape[-1]; + _ofmDepth = separated ? (ohwiShape[0] + _streams - 1 - streamIndex) / _streams : ohwiShape[0]; + _kernelH = ohwiShape.Height(); + _kernelW = ohwiShape.Width(); + + // Bring in values for better cache locality + _ohwiStrides[0] = ohwiStrides[0] * (separated ? _streams : 1); + _ohwiStrides[1] = ohwiStrides[1]; + _ohwiStrides[2] = ohwiStrides[2]; + _ohwiStrides[3] = ohwiStrides[3]; + } + + int Elements() override { return _ofmDepth * _ifmDepth * _kernelH * _kernelW; } + + inline int WeightIndex(int ofm_z, int wy, int wx, int ifm_z) const + { + return ofm_z * _ohwiStrides[0] + wy * _ohwiStrides[1] + wx * _ohwiStrides[2] + ifm_z * _ohwiStrides[3]; + } +}; + +struct WeightEncodeException : public std::runtime_error +{ + WeightEncodeException() : std::runtime_error("weight encode") {} +}; + +struct WeightsNotSparse : public WeightEncodeException +{ + WeightsNotSparse() {} +}; + +} // namespace regor diff --git a/ethosu/regor/bindings/python/py_regor.cpp b/ethosu/regor/bindings/python/py_regor.cpp new file mode 100644 index 00000000..af8a2a51 --- /dev/null +++ b/ethosu/regor/bindings/python/py_regor.cpp @@ -0,0 +1,587 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, 
Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "include/regor_interface.hpp" + +#include +#include +#include +#include +#include +#include + +#include "include/regor.h" + +#define _STRINGIFY(x) #x +#define STRINGIFY(x) _STRINGIFY(x) + +namespace py = pybind11; + +struct PyRegorMemoryAccess +{ + std::string accessType; + int64_t bytesRead = 0; + int64_t bytesWritten = 0; + int64_t accessCycles = 0; + + std::string ToString() const + { + std::string ret; + ret += "Access Type[" + accessType + "]\n"; + ret += "\tread = " + std::to_string(bytesRead) + "\n"; + ret += "\twrite = " + std::to_string(bytesWritten) + "\n"; + ret += "\tcycles = " + std::to_string(accessCycles) + "\n"; + return ret; + } +}; + +struct PyRegorMemoryPerf +{ + std::string memoryName; + int64_t peakUsage = 0; + std::unordered_map accesses; + + std::string ToString() const + { + std::string ret; + ret += "Memory[" + memoryName + "]\n"; + ret += "\tPeak usage = " + std::to_string(peakUsage) + "\n"; + return ret; + } +}; + +struct PyRegorPerfReport +{ + int64_t npuCycles = 0; + int64_t cpuCycles = 0; + int64_t totalCycles = 0; + int64_t macCount = 0; + int64_t cpuOps = 0; + int64_t npuOps = 0; + int64_t cascadedOps = 0; + int64_t cascades = 0; + int64_t originalWeights = 0; + int64_t encodedWeights = 0; + std::string stagingMemoryArea = ""; + + std::unordered_map memories; + + std::string ToString() const + { + assert(npuOps >= 0); + assert(cpuOps >= 0); + assert(cascadedOps >= 0); + assert(cascades >= 0); + + assert(npuOps <= 
(std::numeric_limits::max() - cpuOps)); + auto totalOpsBc = npuOps + cpuOps; + assert(totalOpsBc >= cascadedOps); + assert((totalOpsBc - cascadedOps) <= (std::numeric_limits::max() - cascades)); + + std::string ret; + ret += "NPU Cycles = " + std::to_string(npuCycles) + "\n"; + ret += "CPU Cycles = " + std::to_string(cpuCycles) + "\n"; + ret += "Total Cycles = " + std::to_string(totalCycles) + "\n"; + ret += "Total MACs = " + std::to_string(macCount) + "\n"; + ret += "CPU Operations = " + std::to_string(cpuOps) + "\n"; + ret += "NPU Operations = " + std::to_string(npuOps) + "\n"; + ret += "Total operations (before cascading) = " + std::to_string(totalOpsBc) + "\n"; + ret += "Total operations (after cascading) = " + std::to_string(totalOpsBc - cascadedOps + cascades) + "\n"; + ret += "Original Weights = " + std::to_string(originalWeights) + "\n"; + ret += "Encoded Weights = " + std::to_string(encodedWeights) + "\n"; + for ( const auto &[memName, memory] : memories ) + { + ret += memory.ToString(); + for ( const auto &[_memName, access] : memory.accesses ) + { + ret += access.ToString(); + } + } + return ret; + } +}; + +struct PyRegorDatabaseTable +{ + std::vector header; + std::vector> data; +}; + +struct PyRegorDatabase +{ + std::unordered_map tables; +}; + +struct PyRegorCompiledModel +{ + PyRegorCompiledModel() {} + + void SetPerfReport(PyRegorPerfReport &&report) { perf_report = std::move(report); } + void SetOptDatabase(PyRegorDatabase &&database) { opt_database = std::move(database); } + + PyRegorPerfReport perf_report; + PyRegorDatabase opt_database; +}; + +struct PyRegorCompiledRawModelConstantTensor +{ + PyRegorCompiledRawModelConstantTensor() = default; + PyRegorCompiledRawModelConstantTensor(uint8_t region_, py::bytes data_) : region(region_), data(std::move(data_)) {} + + uint8_t region = 0; + py::bytes data; +}; + +struct PyRegorCompiledRawModelNonConstantTensor +{ + PyRegorCompiledRawModelNonConstantTensor() = default; + 
PyRegorCompiledRawModelNonConstantTensor(int64_t region_, int64_t address_, uint32_t size_, uint8_t element_size_, std::vector &shape_) : + region(region_), address(address_), size(size_), element_size(element_size_), shape(shape_) + { + } + + uint8_t region = 0; + uint64_t address = 0; + uint32_t size = 0; + uint8_t element_size = 0; + std::vector shape; +}; + +struct PyRegorCompiledRawModel : PyRegorCompiledModel +{ + PyRegorCompiledRawModel() : PyRegorCompiledModel() {} + + py::bytes command_stream; + PyRegorCompiledRawModelConstantTensor read_only; + PyRegorCompiledRawModelNonConstantTensor scratch; + PyRegorCompiledRawModelNonConstantTensor scratch_fast; + std::vector inputs; + std::vector outputs; +}; + +struct PyRegorCompiledTFLiteModel : PyRegorCompiledModel +{ + PyRegorCompiledTFLiteModel() : PyRegorCompiledModel(), model(py::none()) {} + PyRegorCompiledTFLiteModel(py::object model_) : PyRegorCompiledModel(), model(std::move(model_)) {} + + py::object model; +}; + +class PyRegor +{ +public: + PyRegor(const std::string &arch) + { + if ( !regor_create(&_context, arch.c_str()) ) + { + throw std::invalid_argument("Unknown architecture " + arch); + } + std::cout.setf(std::ios::unitbuf); + std::cout.flush(); + regor_set_logging(&PyRegor::Log, 0); + } + + ~PyRegor() { regor_destroy(_context); } + + void SetSystemConfig(const std::string &config) + { + if ( !regor_set_system_config(_context, config.c_str(), config.size()) ) + { + throw std::invalid_argument("Invalid System Config"); + } + } + + void SetCompilerOptions(const std::string &options) + { + if ( !regor_set_compiler_options(_context, options.c_str(), options.size()) ) + { + throw std::invalid_argument("Invalid Compiler Options"); + } + } + + py::object PyCompile(py::bytes &input, const std::string &fmt, bool verbose) + { + // Extract input buffer and size of input buffer + py::buffer_info info(py::buffer(input).request()); + const void *in_data = reinterpret_cast(info.ptr); + size_t in_size = 
size_t(std::max(info.size, 0)); + + // Compile the input buffer and return a subclass of PyRegorCompiledModel + return Compile(in_data, in_size, fmt, verbose); + } + + PyRegorPerfReport GetPerfReport() + { + PyRegorPerfReport pyPerf; + regor_perf_report_t report; + int rStatus; + (void)(rStatus); + report.access = nullptr; + + rStatus = regor_get_perf_report(_context, &report); + assert(rStatus); + // Parse accessCount and call again to populate access + std::vector access; + if ( report.accessCount > 0 ) + { + access.resize(report.accessCount); + } + + report.access = access.data(); + rStatus = regor_get_perf_report(_context, &report); + assert(rStatus); + + pyPerf.npuCycles = report.npuCycles; + pyPerf.cpuCycles = report.cpuCycles; + pyPerf.totalCycles = report.totalCycles; + pyPerf.macCount = report.macCount; + pyPerf.cpuOps = report.cpuOps; + pyPerf.npuOps = report.npuOps; + pyPerf.cascadedOps = report.cascadedOps; + pyPerf.cascades = report.cascades; + pyPerf.originalWeights = report.originalWeights; + pyPerf.encodedWeights = report.encodedWeights; + if ( report.npuOps > 0 ) + { + assert(report.numMemories <= int(std::size(report.peakUsages))); + assert(report.stagingMemory >= 0 && report.stagingMemory < report.numMemories); + pyPerf.stagingMemoryArea = report.peakUsages[report.stagingMemory].memoryName; + for ( int i = 0; i < report.numMemories; ++i ) + { + std::string name = report.peakUsages[i].memoryName; + pyPerf.memories[name].memoryName = report.peakUsages[i].memoryName; + pyPerf.memories[name].peakUsage = report.peakUsages[i].peakUsage; + } + for ( const auto &acc : access ) + { + std::string name = acc.memoryName; + pyPerf.memories[name].accesses[name] = {acc.accessType, acc.bytesRead, acc.bytesWritten, acc.accessCycles}; + } + } + return pyPerf; + } + + PyRegorDatabase GetOptDatabase() + { + regor::IRegorReporting *reporting = regor_get_reporting_interface(_context); + assert(reporting); + regor::IDatabase *db = reporting->OptimiserDatabase(); + 
return ToPyDatabase(db); + } + +private: + static void Log(const void *data, size_t size) + { + assert(size <= std::numeric_limits::max()); + std::cout.write(reinterpret_cast(data), size); + } + + PyRegorDatabase ToPyDatabase(regor::IDatabase *db) + { + PyRegorDatabase pyDb; + if ( db != nullptr ) + { + regor::ITableIterator *table = db->Tables(); + while ( table->Next() ) + { + // table name + std::string name = table->Name(); + PyRegorDatabaseTable pyDbTable; + + // header + regor::IRowIterator *row = table->ColumnNames(); + bool isIndexed = (row->Id() > 0); + if ( isIndexed ) pyDbTable.header.push_back("id"); + while ( row->Next() ) + { + pyDbTable.header.push_back(row->Value()); + } + row->Release(); + + // data + int rowCount = table->Rows(); + for ( int i = 0; i < rowCount; i++ ) + { + std::vector data; + row = table->Row(i); + if ( isIndexed ) data.push_back(std::to_string(row->Id())); + while ( row->Next() ) + { + data.push_back(row->Value()); + } + row->Release(); + pyDbTable.data.push_back(data); + } + pyDb.tables[name] = std::move(pyDbTable); + } + table->Release(); + } + return pyDb; + } + + // Internal helper + py::object Compile(const void *input, size_t in_size, const std::string &_fmt, bool verbose) + { + int rStatus; + (void)(rStatus); + + // Capture these + regor_set_logging(&PyRegor::Log, verbose ? ~0u : 0u); + rStatus = regor_set_callback_arg(_context, this); + assert(rStatus); + + regor_format_t fmt = + _fmt == "TFLITE" ? REGOR_INPUTFORMAT_TFLITE : + _fmt == "TOSA" ? 
REGOR_INPUTFORMAT_TOSA : + REGOR_INPUTFORMAT_GRAPHAPI; + + if ( !regor_compile(_context, fmt, input, in_size, nullptr) ) + { + std::string last_error; + size_t last_error_len = 0; + + // Measure error length + if ( !regor_get_error(_context, nullptr, &last_error_len) ) + { + throw std::runtime_error("Compilation failed: Failed to fetch error"); + } + last_error.resize(last_error_len); + + if ( !regor_get_error(_context, last_error.data(), &last_error_len) ) + { + throw std::runtime_error("Compilation failed: Failed to fetch error"); + } + else + { + throw std::runtime_error("Compilation failed: " + last_error); + } + } + + // Get all compiler output + std::vector blobs; + { + regor::IRegorBlob *blob; + while ( regor_get_output(_context, &blob) ) + { + if ( blob ) blobs.push_back(blob); + } + } + + if ( blobs.size() == 1 ) + { + // Likely TFLite output + + assert(fmt == REGOR_INPUTFORMAT_TFLITE && "Only 1 output blob expected for TFLite input"); + + PyRegorCompiledTFLiteModel tfl; + tfl.SetPerfReport(GetPerfReport()); + tfl.SetOptDatabase(GetOptDatabase()); + + int64_t size; + void *buf = blobs[0]->Map(size); + tfl.model = py::bytes(reinterpret_cast(buf), size); + blobs[0]->Unmap(buf); + blobs[0]->Release(); + + return py::cast(tfl); + } + else if ( blobs.size() > 1 ) + { + // Likely raw output + + PyRegorCompiledRawModel raw; + raw.SetPerfReport(GetPerfReport()); + raw.SetOptDatabase(GetOptDatabase()); + + for ( auto &blob : blobs ) + { + int64_t size; + char *buf = reinterpret_cast(blob->Map(size)); + + regor_raw_tensor_header_t header; + std::copy_n(buf, sizeof(header), reinterpret_cast(&header)); + + char *data; + uint32_t data_size; + uint8_t region; + uint64_t address; + uint8_t element_size; + std::vector shape; + + switch ( header.type ) + { + case regor_raw_tensor_header_t::RAW_TENSOR_TYPE_COMMAND_STREAM: + data = buf + sizeof(header); + data_size = header.tensor.command_stream.size; + raw.command_stream = py::bytes(data, data_size); + break; + case 
regor_raw_tensor_header_t::RAW_TENSOR_TYPE_READ_ONLY:
+                    data = buf + sizeof(header);
+                    data_size = header.tensor.read_only.size;
+                    raw.read_only.region = header.tensor.read_only.region;
+                    raw.read_only.data = py::bytes(data, data_size);
+                    break;
+                case regor_raw_tensor_header_t::RAW_TENSOR_TYPE_SCRATCH:
+                    raw.scratch.region = header.tensor.scratch.region;
+                    raw.scratch.size = header.tensor.scratch.size;
+                    raw.scratch.address = header.tensor.scratch.address;
+                    break;
+                case regor_raw_tensor_header_t::RAW_TENSOR_TYPE_SCRATCH_FAST:
+                    raw.scratch_fast.region = header.tensor.scratch_fast.region;
+                    raw.scratch_fast.size = header.tensor.scratch_fast.size;
+                    raw.scratch_fast.address = header.tensor.scratch_fast.address;
+                    break;
+                case regor_raw_tensor_header_t::RAW_TENSOR_TYPE_INPUT:
+                    region = header.tensor.input.region;
+                    address = header.tensor.input.address;
+                    data_size = header.tensor.input.size;
+                    element_size = header.tensor.input.element_size;
+                    shape.insert(shape.end(), header.tensor.input.shape, header.tensor.input.shape + 4);
+                    raw.inputs.emplace_back(region, address, data_size, element_size, shape);
+                    break;
+                case regor_raw_tensor_header_t::RAW_TENSOR_TYPE_OUTPUT:
+                    region = header.tensor.output.region;
+                    address = header.tensor.output.address;
+                    data_size = header.tensor.output.size;
+                    element_size = header.tensor.output.element_size;
+                    shape.insert(shape.end(), header.tensor.output.shape, header.tensor.output.shape + 4);
+                    raw.outputs.emplace_back(region, address, data_size, element_size, shape);
+                    break;
+                default:
+                    break;
+            }
+
+            blob->Unmap(buf);
+            blob->Release();
+        }
+
+        return py::cast(raw);
+    }
+    else
+    {
+        throw std::runtime_error("Compilation generated no output blobs");
+    }
+}
+
+    regor_context_t _context;
+};
+
+PYBIND11_MODULE(regor, m)
+{
+    m.doc() = R"pbdoc(
+        Welcome to Regor - the brightest star in Vela
+    )pbdoc";
+
+    m.attr("__version__") = STRINGIFY(REGOR_VERSION);
+
+    py::class_<PyRegorMemoryAccess>(m, "MemoryAccess", "Regor memory accesses")
+        .def(py::init<>())
.def_readwrite("accessType", &PyRegorMemoryAccess::accessType, "Access type") + .def_readwrite("bytesRead", &PyRegorMemoryAccess::bytesRead, "Bytes read") + .def_readwrite("bytesWritten", &PyRegorMemoryAccess::bytesWritten, "Bytes written") + .def_readwrite("accessCycles", &PyRegorMemoryAccess::accessCycles, "Total access cycles") + .def("__repr__", &PyRegorMemoryAccess::ToString); + py::class_(m, "MemoryPerf", "A Regor memory performance report") + .def(py::init<>()) + .def_readwrite("memoryName", &PyRegorMemoryPerf::memoryName, "Memory name") + .def_readwrite("peakUsage", &PyRegorMemoryPerf::peakUsage, "Peak usage") + .def_readwrite("accesses", &PyRegorMemoryPerf::accesses, "Accesses") + .def("__repr__", &PyRegorMemoryPerf::ToString); + py::class_(m, "PerfReport", "A Regor performance report") + .def(py::init<>()) + .def_readwrite("npuCycles", &PyRegorPerfReport::npuCycles, "NPU elapsed cycles") + .def_readwrite("cpuCycles", &PyRegorPerfReport::cpuCycles, "CPU elapsed cycles") + .def_readwrite("totalCycles", &PyRegorPerfReport::totalCycles, "Total elapsed time in cycles") + .def_readwrite("macCount", &PyRegorPerfReport::macCount, "Number of Multiply-Accumulate operations") + .def_readwrite("cpuOps", &PyRegorPerfReport::cpuOps, "Number of CPU operations") + .def_readwrite("npuOps", &PyRegorPerfReport::npuOps, "Number of NPU operations") + .def_readwrite("cascadedOps", &PyRegorPerfReport::cascadedOps, "Number of cascaded operations") + .def_readwrite("cascades", &PyRegorPerfReport::cascades, "Number of cascades") + .def_readwrite("originalWeights", &PyRegorPerfReport::originalWeights, "Weights size (uncompressed)") + .def_readwrite("encodedWeights", &PyRegorPerfReport::encodedWeights, "Weights size (compressed)") + .def_readwrite("stagingMemoryArea", &PyRegorPerfReport::stagingMemoryArea, "Staging memory area") + .def_readwrite("memories", &PyRegorPerfReport::memories, "Memory performance report") + .def("__repr__", &PyRegorPerfReport::ToString); + + py::class_(m, 
"DatabaseTable", "A regor database table") + .def(py::init<>()) + .def_readwrite("header", &PyRegorDatabaseTable::header, "database headers") + .def_readwrite("data", &PyRegorDatabaseTable::data, "database data"); + + py::class_(m, "Database", "A regor database").def(py::init<>()).def_readwrite("tables", &PyRegorDatabase::tables, "database tables"); + + py::class_(m, "Regor", "The main Regor compiler class") + .def(py::init()) + .def("SetSystemConfig", &PyRegor::SetSystemConfig, "Set the system configuration") + .def("SetCompilerOptions", &PyRegor::SetCompilerOptions, "Set compiler options") + .def("Compile", &PyRegor::PyCompile, "Compile the input model into a TFLite Flatbuffer", py::arg("input"), + py::arg("fmt"), py::arg("verbose") = false) + .def("GetPerfReport", &PyRegor::GetPerfReport, "Get the performance report for the latest compiled model") + .def("GetOptDatabase", &PyRegor::GetOptDatabase, "Get the optimiser database for the latest compiled model"); + + py::class_(m, "CompiledRawModelNonConstantTensor", "A non-constant tensor of a Regor-compiled model in raw format") + .def(py::init<>()) + .def_readwrite("region", &PyRegorCompiledRawModelNonConstantTensor::region, "The tensor's region") + .def_readwrite("address", &PyRegorCompiledRawModelNonConstantTensor::address, "The tensor's address") + .def_readwrite("size", &PyRegorCompiledRawModelNonConstantTensor::size, "The tensor's size") + .def_readwrite("element_size", &PyRegorCompiledRawModelNonConstantTensor::element_size, "The tensor's element size") + .def_readwrite("shape", &PyRegorCompiledRawModelNonConstantTensor::shape, "The tensor's shape"); + + py::class_(m, "CompiledRawModelConstantTensor", "A constant tensor of a Regor-compiled model in raw format") + .def(py::init<>()) + .def_readwrite("region", &PyRegorCompiledRawModelConstantTensor::region, "The tensor's region") + .def_readwrite("data", &PyRegorCompiledRawModelConstantTensor::data, "The tensor's constant data"); + + py::class_(m, 
"CompiledModel", "A Regor-compiled model") + .def(py::init<>()) + .def_readwrite("perf_report", &PyRegorCompiledTFLiteModel::perf_report, "The performance report for the compiled model") + .def_readwrite("opt_database", &PyRegorCompiledTFLiteModel::opt_database, "The optimiser database for the compiled model"); + + py::class_(m, "CompiledRawModel", "A Regor-compiled model in raw format") + .def(py::init<>()) + .def_readwrite("command_stream", &PyRegorCompiledRawModel::command_stream, "The compiled model command stream") + .def_readwrite("read_only", &PyRegorCompiledRawModel::read_only, "The compiled model weights") + .def_readwrite("scratch", &PyRegorCompiledRawModel::scratch, "The compiled model scratch area") + .def_readwrite("scratch_fast", &PyRegorCompiledRawModel::scratch_fast, "The compiled model scratch fast area") + .def_readwrite("inputs", &PyRegorCompiledRawModel::inputs, "The compiled model inputs") + .def_readwrite("outputs", &PyRegorCompiledRawModel::outputs, "The compiled model outputs"); + + py::class_(m, "CompiledTFLiteModel", "A Regor-compiled TFLite model") + .def(py::init<>()) + .def_readwrite("model", &PyRegorCompiledTFLiteModel::model, "The compiled model TFLite blob"); + + m.def( + "compile", + [](const std::string &arch, py::bytes &input, const std::string &fmt, const std::string &sysconfig, + const std::string &options = "", bool verbose = false) -> py::object + { + PyRegor pyr(arch); + pyr.SetSystemConfig(sysconfig); + if ( options.size() > 0 ) + { + pyr.SetCompilerOptions(options); + } + + return pyr.PyCompile(input, fmt, verbose); + }, + R"pbdoc( + Compile a model + Returns a compiled model + )pbdoc", + py::arg("arch"), py::arg("input"), py::arg("fmt"), py::arg("sysconfig"), py::arg("options") = "", py::arg("verbose") = false); +} diff --git a/ethosu/regor/cmake/cpack_config.cmake b/ethosu/regor/cmake/cpack_config.cmake new file mode 100644 index 00000000..24042e1f --- /dev/null +++ b/ethosu/regor/cmake/cpack_config.cmake @@ -0,0 +1,83 @@ 
+# +# SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# only include last at the top level +if (NOT "${CMAKE_CURRENT_SOURCE_DIR}" STREQUAL "${CMAKE_SOURCE_DIR}") + return() +endif() + +include(utils) +include(InstallRequiredSystemLibraries) + +# Use a PEP-656 compliant package tag +# The default value for this variable is not useful +if (CMAKE_CROSSCOMPILING) + if (CMAKE_SYSTEM_NAME STREQUAL "Linux" AND CMAKE_SYSTEM_PROCESSOR STREQUAL "i386") + set(REGOR_SYSTEM_NAME "linux-i586") + else() + message(FATAL_ERROR "Unknown cross-compile system") + endif() +else() + utils_find_python() + execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import sysconfig; print(sysconfig.get_platform())" + OUTPUT_VARIABLE REGOR_SYSTEM_NAME OUTPUT_STRIP_TRAILING_WHITESPACE) +endif() + +set(CPACK_PACKAGE_NAME ${REGOR_PACKAGE_NAME}) + +if (NOT CPACK_PROJECT_NAME) + # Can be overriden + set(CPACK_PROJECT_NAME ${CMAKE_PROJECT_NAME}) +endif() +if (NOT CPACK_PACKAGE_NAME) + # Can be overriden + set(CPACK_PACKAGE_NAME ${CPACK_PROJECT_NAME}) +endif() +if (NOT CPACK_COMPONENT_NAME) + # Can be overriden + set(CPACK_COMPONENT_NAME ${CPACK_PROJECT_NAME}) +endif() + +set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${REGOR_SYSTEM_NAME}) + +# Default variables +set(CPACK_PACKAGE_VENDOR "Arm") +set(CPACK_PACKAGE_DESCRIPTION "${${CPACK_PROJECT_NAME}_DESCRIPTION}") +set(CPACK_PACKAGE_VERSION_MAJOR 
"${${CPACK_PROJECT_NAME}_VERSION_MAJOR}") +set(CPACK_PACKAGE_VERSION_MINOR "${${CPACK_PROJECT_NAME}_VERSION_MINOR}") +if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + set(CPACK_STRIP_FILES FALSE) +else() + set(CPACK_STRIP_FILES TRUE) +endif() +set(CPACK_VERBATIM_VARIABLES TRUE) + +# Archive generator setup +set(CPACK_BINARY_TGZ ON) +set(CPACK_BINARY_STGZ OFF) +set(CPACK_BINARY_TBZ2 OFF) +set(CPACK_BINARY_TXZ OFF) +set(CPACK_BINARY_TZ OFF) + +# Collect all exported targets +set(CPACK_INSTALL_CMAKE_PROJECTS + "${CMAKE_CURRENT_BINARY_DIR};${CPACK_PROJECT_NAME};${CPACK_COMPONENT_NAME};/") + +# Include CPack last +include(CPack) diff --git a/ethosu/regor/cmake/pkg-config.cmake.in b/ethosu/regor/cmake/pkg-config.cmake.in new file mode 100644 index 00000000..70c4ced5 --- /dev/null +++ b/ethosu/regor/cmake/pkg-config.cmake.in @@ -0,0 +1,26 @@ +# +# SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +@PACKAGE_INIT@ + +set_and_check(TARGETS_FILE + "${CMAKE_CURRENT_LIST_DIR}/@PACKAGE_NAME@-targets.cmake") + +include(${TARGETS_FILE}) + +check_required_components(@PACKAGE_NAME@) diff --git a/ethosu/regor/cmake/regor_dependencies.cmake b/ethosu/regor/cmake/regor_dependencies.cmake new file mode 100644 index 00000000..c971da02 --- /dev/null +++ b/ethosu/regor/cmake/regor_dependencies.cmake @@ -0,0 +1,82 @@ +# +# SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +include_guard(GLOBAL) + +include(regor_lib) + +############################################################################ +# An add_subdirectory wrapper fixing up imported targets +############################################################################ + +function(regor_add_dependency dir) + add_subdirectory(${dir} ${ARGN}) + + # Get all target dirs + set(to_visit ${dir}) + set(total_dirs ${dir}) + while (to_visit) + set(_to_visit) + foreach (d IN LISTS to_visit) + get_directory_property(d_dirs DIRECTORY ${d} SUBDIRECTORIES) + if (d_dirs) + list(APPEND _to_visit ${d_dirs}) + endif() + endforeach() + list(REMOVE_DUPLICATES _to_visit) + if (_to_visit) + list(APPEND total_dirs ${_to_visit}) + endif() + set(to_visit ${_to_visit}) + endwhile() + list(REMOVE_DUPLICATES total_dirs) + + # Fix all targets + foreach (d IN LISTS total_dirs) + get_directory_property(bdir_targets DIRECTORY ${d} BUILDSYSTEM_TARGETS) + foreach (bdir_target IN LISTS bdir_targets) + # Skip custom targets + get_target_property(tp ${bdir_target} TYPE) + if ("${tp}" STREQUAL "UTILITY") + continue() + endif() + + # Add default flags + regor_add_options(${bdir_target}) + + # Set include paths as system + utils_set_system_include_paths(${bdir_target}) + + # Disable diagnostics + utils_disable_warnings(${bdir_target}) + + # Exclude from ALL. 
This can't be done directly in add_subdirectory + if (NOT "${tp}" STREQUAL "INTERFACE_LIBRARY") + set_target_properties(${bdir_target} PROPERTIES EXCLUDE_FROM_ALL TRUE) + endif() + endforeach() + endforeach() +endfunction() + +############################################################################ +# Add internal and thirdparty dependencies +############################################################################ + +regor_add_dependency("${CMAKE_CURRENT_SOURCE_DIR}/dependencies/mlw_codec") + +include(regor_thirdparty) diff --git a/ethosu/regor/cmake/regor_lib.cmake b/ethosu/regor/cmake/regor_lib.cmake new file mode 100644 index 00000000..512c1695 --- /dev/null +++ b/ethosu/regor/cmake/regor_lib.cmake @@ -0,0 +1,312 @@ +# +# SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +include_guard(GLOBAL) + +include(GNUInstallDirs) +include(utils) +include(regor_options) +include(CMakePackageConfigHelpers) + + +# This function installs a target in a component with a namespace and produces a CMake export +# which links both the namespace and the component. +# An optional include sub-path can be specified for public headers under the component include +# path. 
This can be useful for "sub-component" +function(regor_install) + cmake_parse_arguments(_ + "" + "COMPONENT;NAMESPACE;TARGET;INCLUDE" + "" + ${ARGN}) + + if (NOT __COMPONENT) + return() + endif() + + install(TARGETS ${__TARGET} + EXPORT ${__COMPONENT}-targets + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" COMPONENT ${__COMPONENT} + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" COMPONENT ${__COMPONENT} + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" COMPONENT ${__COMPONENT} + PUBLIC_HEADER DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${__COMPONENT}/${__INCLUDE}" COMPONENT ${__COMPONENT} + ) + get_target_property(tp ${__TARGET} TYPE) + if (MSVC AND "${tp}" STREQUAL "STATIC_LIBRARY" AND NOT CMAKE_VERSION VERSION_LESS 3.15) + install(FILES + "$/$$.pdb" + DESTINATION ${CMAKE_INSTALL_LIBDIR} + COMPONENT ${__COMPONENT} + OPTIONAL) + endif() + # Emit export + if (NOT TARGET install-${__COMPONENT}) + install(EXPORT ${__COMPONENT}-targets + FILE ${__COMPONENT}-targets.cmake + NAMESPACE ${__NAMESPACE}:: + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${__COMPONENT}" + COMPONENT ${__COMPONENT}) + + # Convenience target + add_custom_target(install-${__COMPONENT} + COMMAND + "${CMAKE_COMMAND}" -DCMAKE_INSTALL_COMPONENT=${__COMPONENT} + -P "${CMAKE_CURRENT_BINARY_DIR}/cmake_install.cmake") + + # Package definition. 
+ # These calls are wrappers around configure_file producing standard + # files to be employed by client code using find_package(${__COMPONENT}) + # They produce a relocatable package + set(PACKAGE_NAME ${__COMPONENT}) + configure_package_config_file(${REGOR_SOURCE_DIR}/cmake/pkg-config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/${__COMPONENT}-config.cmake + INSTALL_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${__COMPONENT}" + PATH_VARS PACKAGE_NAME) + unset(PACKAGE_NAME) + write_basic_package_version_file( + ${CMAKE_CURRENT_BINARY_DIR}/${__COMPONENT}-config-version.cmake + VERSION "${${PROJECT_NAME}_VERSION}" + COMPATIBILITY AnyNewerVersion + ) + + # Install for the produced configure_files above + install(FILES + "${CMAKE_CURRENT_BINARY_DIR}/${__COMPONENT}-config.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/${__COMPONENT}-config-version.cmake" + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${__COMPONENT}" + COMPONENT ${__COMPONENT}) + + # This produces a non-relocatable "quick" package that can be directly + # consumed after a build + export(EXPORT ${__COMPONENT}-targets + FILE ${CMAKE_CURRENT_BINARY_DIR}/${__COMPONENT}-targets.cmake) + endif() + add_dependencies(install-${__COMPONENT} ${__TARGET}) +endfunction() + +# This function implements an add_library wrapper for non-interface libraries. +# It makes sure: +# - flags are properly set +# - Install exports is done when the COMPONENT option is present +# +# Note all settings have a default scope pinned to PRIVATE except for OBJECT +# libraries and PUBLIC_HEADERS. 
Extra settings outside the scope of this function +# can be done to the target following the call +function(regor_lib) + cmake_parse_arguments(_ + "EXCLUDE_FROM_ALL" + "NAME;COMPONENT;TYPE;OUTPUT_NAME;INSTALL_LOCATION" + "PUBLIC_HEADERS;SOURCES;COPTS;DEFINES;LOPTS;DEPS;INC_DIRS" + ${ARGN}) + + if ("${__TYPE}" STREQUAL "STATIC") + set(__default_scope PRIVATE) + set(__is_dll FALSE) + elseif ("${__TYPE}" STREQUAL "SHARED") + set(__default_scope PRIVATE) + set(__is_dll TRUE) + elseif ("${__TYPE}" STREQUAL "OBJECT") + set(__default_scope PUBLIC) + set(__is_dll FALSE) + elseif ("${__TYPE}" STREQUAL "PY_MODULE") + set(__default_scope PRIVATE) + set(__is_dll TRUE) + else() + message(FATAL_ERROR "Unexpected lib type ${__TYPE}") + endif() + + if ("${__TYPE}" STREQUAL "PY_MODULE") + pybind11_add_module(${__NAME} ${__SOURCES}) + elseif (__EXCLUDE_FROM_ALL) + add_library(${__NAME} ${__TYPE} EXCLUDE_FROM_ALL ${__SOURCES}) + else() + add_library(${__NAME} ${__TYPE} ${__SOURCES}) + endif() + if (__OUTPUT_NAME) + if (MSVC AND "${__TYPE}" STREQUAL "SHARED") + # MSVC generates static artifacts for DLLs + set_target_properties(${__NAME} PROPERTIES RUNTIME_OUTPUT_NAME + ${__OUTPUT_NAME}) + set_target_properties(${__NAME} PROPERTIES ARCHIVE_OUTPUT_NAME + ${__OUTPUT_NAME}_st) + else() + set_target_properties(${__NAME} PROPERTIES OUTPUT_NAME + ${__OUTPUT_NAME}) + endif() + endif() + + foreach (dep IN LISTS __DEPS) + # Workaround for https://gitlab.kitware.com/cmake/cmake/-/issues/15415 + # Recurse deps + set(to_visit ${dep}) + set(total_deps ${dep}) + while (to_visit) + set(_to_visit) + foreach (tgt IN LISTS to_visit) + # Imported libs and non-targets are linked as usual + set(no_target FALSE) + set(imported FALSE) + if (TARGET ${tgt}) + get_target_property(imported ${tgt} IMPORTED) + get_target_property(tp ${tgt} TYPE) + else() + set(no_target TRUE) + endif() + if (no_target OR imported OR (__is_dll AND NOT "${tp}" STREQUAL "OBJECT_LIBRARY")) + target_link_libraries(${__NAME} 
${__default_scope} ${tgt}) + list(REMOVE_ITEM total_deps ${tgt}) + continue() + endif() + + get_target_property(ideps ${tgt} INTERFACE_LINK_LIBRARIES) + if (NOT ideps) + continue() + endif() + + foreach(idep IN LISTS ideps) + # Strip LINK_ONLY + string(REGEX REPLACE "^\\$$" "\\1" ridep "${idep}") + # Collect this target + list(APPEND total_deps ${ridep}) + list(APPEND _to_visit ${ridep}) + endforeach() + endforeach() + list(REMOVE_DUPLICATES _to_visit) + set(to_visit ${_to_visit}) + endwhile() + list(REMOVE_DUPLICATES total_deps) + + # Now "link" collected target dependencies + foreach (tgt IN LISTS total_deps) + get_target_property(tp ${tgt} TYPE) + if (NOT "${tp}" STREQUAL "INTERFACE_LIBRARY") + target_sources(${__NAME} ${__default_scope} $) + endif() + # Hand-link interface + target_link_options(${__NAME} ${__default_scope} $) + target_include_directories(${__NAME} ${__default_scope} $) + target_include_directories(${__NAME} SYSTEM ${__default_scope} $) + target_compile_options(${__NAME} ${__default_scope} $) + target_compile_definitions(${__NAME} ${__default_scope} $) + target_sources(${__NAME} ${__default_scope} $) + endforeach() + endforeach() + + regor_add_options(${__NAME}) + if (__INC_DIRS) + target_include_directories(${__NAME} ${__default_scope} ${__INC_DIRS}) + endif() + if (__DEFINES) + target_compile_definitions(${__NAME} ${__default_scope} ${__DEFINES}) + endif() + if (__COPTS) + target_compile_options(${__NAME} ${__default_scope} ${__COPTS}) + endif() + if (__is_dll AND __LOPTS) + target_link_options(${__NAME} ${__default_scope} ${__LOPTS}) + endif() + + if (NOT "${__TYPE}" STREQUAL "PY_MODULE") + set_target_properties(${__NAME} PROPERTIES + VERSION ${${PROJECT_NAME}_VERSION}) + + if (__PUBLIC_HEADERS) + set_target_properties(${__NAME} PROPERTIES + PUBLIC_HEADER "${__PUBLIC_HEADERS}") + endif() + endif() + foreach (dir IN LISTS __PUBLIC_HEADERS) + get_filename_component(ahdr ${dir} ABSOLUTE) + get_filename_component(dir ${ahdr} DIRECTORY) + 
target_include_directories(${__NAME} + INTERFACE + "$" + "$/${CMAKE_INSTALL_INCLUDEDIR}>") + endforeach() + + set(__NAMESPACE ${PROJECT_NAME}) + string(TOLOWER ${__NAMESPACE} __NAMESPACE) + if (__INSTALL_LOCATION AND __COMPONENT) + install(TARGETS ${__NAME} + RUNTIME DESTINATION "${__INSTALL_LOCATION}" COMPONENT "${__COMPONENT}" + LIBRARY DESTINATION "${__INSTALL_LOCATION}" COMPONENT "${__COMPONENT}" + ARCHIVE DESTINATION "${__INSTALL_LOCATION}" COMPONENT "${__COMPONENT}" + ) + if (NOT TARGET install-${__COMPONENT}) + add_custom_target(install-${__COMPONENT} + COMMAND + "${CMAKE_COMMAND}" + -DCMAKE_INSTALL_COMPONENT=${__COMPONENT} + -P "${CMAKE_BINARY_DIR}/cmake_install.cmake" + ) + endif() + add_dependencies(install-${__COMPONENT} ${__NAME}) + elseif (NOT "${__TYPE}" STREQUAL "OBJECT") + regor_install( + NAMESPACE ${__NAMESPACE} + COMPONENT ${__COMPONENT} + TARGET ${__NAME}) + endif() + add_library(${__NAMESPACE}::${__NAME} ALIAS ${__NAME}) +endfunction() + +# Same version of the above function for executables +function(regor_exe) + cmake_parse_arguments(_ + "EXCLUDE_FROM_ALL" + "NAME;COMPONENT;OUTPUT_NAME" + "SOURCES;COPTS;DEFINES;LOPTS;DEPS;INC_DIRS" + ${ARGN}) + + set(__default_scope PRIVATE) + + if (__EXCLUDE_FROM_ALL) + add_executable(${__NAME} EXCLUDE_FROM_ALL ${__SOURCES}) + else() + add_executable(${__NAME} ${__SOURCES}) + endif() + if (__OUTPUT_NAME) + set_target_properties(${__NAME} PROPERTIES OUTPUT_NAME + ${__OUTPUT_NAME}) + endif() + regor_add_options(${__NAME}) + if (__INC_DIRS) + target_include_directories(${__NAME} ${__default_scope} ${__INC_DIRS}) + endif() + if (__DEFINES) + target_compile_definitions(${__NAME} ${__default_scope} ${__DEFINES}) + endif() + if (__COPTS) + target_compile_options(${__NAME} ${__default_scope} ${__COPTS}) + endif() + if (__LOPTS) + target_link_options(${__NAME} ${__default_scope} ${__LOPTS}) + endif() + if (__DEPS) + target_link_libraries(${__NAME} ${__default_scope} ${__DEPS}) + endif() + 
set_target_properties(${__NAME} PROPERTIES + VERSION ${${PROJECT_NAME}_VERSION}) + + set(__NAMESPACE ${PROJECT_NAME}) + string(TOLOWER ${__NAMESPACE} __NAMESPACE) + regor_install( + NAMESPACE ${__NAMESPACE} + COMPONENT ${__COMPONENT} + TARGET ${__NAME}) +endfunction() diff --git a/ethosu/regor/cmake/regor_options.cmake b/ethosu/regor/cmake/regor_options.cmake new file mode 100644 index 00000000..8d22ec85 --- /dev/null +++ b/ethosu/regor/cmake/regor_options.cmake @@ -0,0 +1,279 @@ +# +# SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# No include guard - we want the below variables to be set in any including scope + +set(REGOR_DEFAULT_COPTS) +set(REGOR_DEFAULT_DOPTS) +set(REGOR_DEFAULT_LOPTS) + +set(REGOR_C_STANDARD 99) +set(REGOR_C_STANDARD_REQUIRED ON) +set(REGOR_C_EXTENSIONS OFF) +set(REGOR_CXX_STANDARD 17) +set(REGOR_CXX_STANDARD_REQUIRED ON) +set(REGOR_CXX_EXTENSIONS OFF) +set(REGOR_POSITION_INDEPENDENT_CODE ON) +set(REGOR_CXX_VISIBILITY_PRESET hidden) +set(REGOR_VISIBILITY_INLINES_HIDDEN ON) + +# Remove default debug flags from C/CXX flags, this is controlled by REGOR_ENABLE_ASSERT instead +string(TOUPPER ${CMAKE_BUILD_TYPE} UPPER_CONFIG) +string( REGEX REPLACE "[/-]D[;]*[N|_]DEBUG" "" CMAKE_CXX_FLAGS_${UPPER_CONFIG} "${CMAKE_CXX_FLAGS_${UPPER_CONFIG}}") +string( REGEX REPLACE "[/-]D[;]*[N|_]DEBUG" "" CMAKE_C_FLAGS_${UPPER_CONFIG} "${CMAKE_C_FLAGS_${UPPER_CONFIG}}") + +# Check ASSEMBLER/CXX/LINKER flag together with other_flags +# If it checks they all get added to flag_list +function (checked_flag tool flag flag_list) + set(other_flags ${ARGN}) + # Hash a var name for the cache + string(REGEX REPLACE "[ -;=]" "_" var_name "${flag} ${other_flags}") + + # Tool option + if (MSVC) + if ("${tool}" STREQUAL "LINKER") + set(tool_opt "/link") + endif() + else() + if ("${tool}" STREQUAL "LINKER") + set(tool_opt "-Xlinker") + elseif ("${tool}" STREQUAL "ASSEMBLER") + set(tool_opt "-Xassembler") + endif() + endif() + + if (${var_name}_set) + set(flag_not_supported "${${var_name}_val}") + else() + string(REPLACE " " ";" __flags "${flag}") + if (other_flags OR tool_opt) + list(INSERT __flags 0 ${other_flags} ${tool_opt}) + endif() + if (MSVC) + list(APPEND __flags "/link") + list(APPEND __flags "/out:cxx_check.exe") + else() + list(APPEND __flags "-o") + list(APPEND __flags "cxx_check") + endif() + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/cxx_check.cc "int main(void){ return 0; }") + string(REPLACE " " ";" user_args "${CMAKE_CXX_COMPILER_ARG1}") + execute_process(COMMAND ${CMAKE_CXX_COMPILER} 
cxx_check.cc ${user_args} ${__flags} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + OUTPUT_QUIET ERROR_QUIET + RESULT_VARIABLE flag_not_supported) + set(${var_name}_val "${flag_not_supported}" CACHE INTERNAL "") + set(${var_name}_set TRUE CACHE INTERNAL "") + endif() + if (flag_not_supported) + message(STATUS "Looking for ${tool} flag support (${flag}) - Not found") + else() + message(STATUS "Looking for ${tool} flag support (${flag}) - Success") + string(REPLACE " " ";" flag "${tool_opt} ${flag} ${other_flags}") + set(${flag_list} "${${flag_list}};${flag}" PARENT_SCOPE) + endif() +endfunction() + +function(get_glibc_version var) + execute_process(COMMAND ldd --version + OUTPUT_VARIABLE LDD_STR + OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REPLACE "\n" ";" LDD_LINES ${LDD_STR}) + list(GET LDD_LINES 0 LDD_FIRST_LINE) + string(REPLACE " " ";" LDD_FIRST_LINE_LIST ${LDD_FIRST_LINE}) + list(GET LDD_FIRST_LINE_LIST -1 GLIBC_VER) + set(${var} ${GLIBC_VER} PARENT_SCOPE) + message(STATUS "Looking for GLIBC - Found version ${GLIBC_VER}") +endfunction() + +# Base options +if(MSVC) + # On MSVC, CMake sets /GR by default (enabling RTTI), but we set /GR- + string(REPLACE "/GR" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + + list(APPEND REGOR_DEFAULT_COPTS + "$<$,$>:/bigobj>" # C1128: number of sections exceeded object file format limit: compile with /bigobj + "$<$,$>,$>:$,/GR,/GR->>" + "$<$,$>:/experimental:external>" + "$<$,$>:/external:W0>" + "$<$,$>:/external:anglebrackets>" + ) +else() + list(APPEND REGOR_DEFAULT_COPTS + "$,-frtti,-fno-rtti>" + "$<$:-ggdb>" + "$<$:-g3>" + "$<$:-fprofile-arcs>" + "$<$:-ftest-coverage>" + "$<$:-fno-omit-frame-pointer>" + # https://gitlab.kitware.com/cmake/cmake/-/issues/23136 + "$<$:$,-ffat-lto-objects,-flto=full>>" + "$<$:-fsanitize=${REGOR_SANITIZE}>" + ) + if (REGOR_SANITIZE) + # There's just too much code being added by the sanitizer + checked_flag(CXX "-fno-var-tracking-assignments" REGOR_DEFAULT_COPTS) + checked_flag(CXX 
"-fno-sanitize-recover=all" REGOR_DEFAULT_COPTS) + endif() + if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + # -gz via assembler + checked_flag(ASSEMBLER "--compress-debug-sections" REGOR_DEFAULT_COPTS "-fnodebug-types-section") + checked_flag(CXX "-gdwarf-4" REGOR_DEFAULT_COPTS) + endif() +endif() + +# Definitions +list(APPEND REGOR_DEFAULT_DOPTS + "$<$>:NDEBUG>" + "$<$:_DEBUG>" + "$<$:_GLIBCXX_DEBUG>" + "$<$:_GLIBCXX_DEBUG_PEDANTIC>" + "$<$:NOMINMAX>" + "$<$:_CRT_SECURE_NO_WARNINGS>" +) + +# Base link flags +if(NOT MSVC) + if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + checked_flag(LINKER "--compress-debug-sections=${REGOR_DEBUG_COMPRESSION}" gz_supported "-fdebug-types-section") + list(APPEND REGOR_DEFAULT_LOPTS ${gz_supported}) + # Add gold only if compressed sections are supported + checked_flag(LINKER "--compress-debug-sections=${REGOR_DEBUG_COMPRESSION}" gold_gz_supported "-fuse-ld=gold" "-fdebug-types-section") + if (gz_supported AND NOT gold_gz_supported) + set(REGOR_ENABLE_LDGOLD OFF CACHE BOOL "Enable Gold linker if available" FORCE) + endif() + unset(gz_supported) + unset(gold_gz_supported) + endif() + list(APPEND REGOR_DEFAULT_LOPTS + "$<$:-fprofile-arcs>" + "$<$:-fuse-ld=gold>" + "$<$:-s>" + "$<$:-static-libstdc++>" + "$<$:-static-libgcc>" + "$<$:-fsanitize=${REGOR_SANITIZE}>" + ) + if (REGOR_SANITIZE) + checked_flag(LINKER "-static-libubsan" REGOR_DEFAULT_LOPTS) + endif() + list(APPEND REGOR_DEFAULT_LOPTS "-lm") + if (UNIX AND NOT APPLE) + get_glibc_version(_GLIBC_VER) + if (_GLIBC_VER) + if (${_GLIBC_VER} VERSION_LESS 2.17) + list(APPEND REGOR_DEFAULT_LOPTS "-lrt") + endif() + endif() + endif() +endif() + +# Diagnostics +if(MSVC) + list(APPEND REGOR_DEFAULT_COPTS + "/W3" # Default warning level (severe + significant + production quality). 
+ "$<$:/WX>" + "$<$,$>:/wd4200>" # "nonstandard extension used : zero-sized array in struct/union" + "$<$,$>:/wd4018>" # "signed/unsigned mismatch in comparison" + "$<$,$>:/wd4146>" # operator applied to unsigned type, result still unsigned + "$<$,$>:/wd4244>" # possible loss of data + "$<$,$>:/wd4267>" # initializing: possible loss of data + "$<$,$>:/wd4005>" # allow: macro redefinition + "$<$,$>:/wd4065>" # allow: switch statement contains 'default' but no 'case' labels + "$<$,$>:/wd4141>" # allow: inline used more than once + "$<$,$>:/wd4624>" # allow: destructor was implicitly defined as deleted + "$<$,$>:/wd4146>" # operator applied to unsigned type, result still unsigned + "$<$,$>:/wd4244>" # possible loss of data + "$<$,$>:/wd4267>" # initializing: possible loss of data + "$<$,$>:/wd5105>" # allow: macro expansion producing 'defined' has undefined behavior + ) +else() + list(APPEND REGOR_DEFAULT_COPTS + "-Wall" + "-Wextra" + "$<$:-Werror>" + + "-Wdouble-promotion" + "-Wshadow" + "-Wredundant-decls" + "-Wcast-align" + "-Wmissing-declarations" + "-Wmissing-include-dirs" + "-Wswitch-enum" + "-Wswitch-default" + "-Winvalid-pch" + "-Wformat=2" + "-Wmissing-format-attribute" + "-Wformat-nonliteral" + "$<$:-Wold-style-cast>" + "-Wformat-security" + "-Wimplicit-fallthrough" + "$<$:-Wnon-virtual-dtor>" + "$<$>:-Woverloaded-virtual>" # Not working in GCC + "-Wvla" + "-Wformat-nonliteral" + "$<$:-Wlogical-op>" + + # Disabled + "-Wno-switch-enum" # TODO : Switch case in TFLite ops handling + "$<$:-Wno-array-bounds>" # TODO : False positives on Shape operators + "-Wno-unused-function" + "-Wno-unused" + "-Wno-double-promotion" + ) +endif() + +function(regor_add_options tgt) + get_target_property(tp ${tgt} TYPE) + if ("${tp}" STREQUAL "STATIC_LIBRARY") + set(__default_scope PRIVATE) + set(__link FALSE) + elseif ("${tp}" STREQUAL "OBJECT_LIBRARY") + set(__default_scope PUBLIC) + set(__link FALSE) + elseif ("${tp}" STREQUAL "SHARED_LIBRARY") + set(__default_scope PRIVATE) + 
set(__link TRUE) + elseif ("${tp}" STREQUAL "MODULE_LIBRARY") + set(__default_scope PRIVATE) + set(__link TRUE) + elseif ("${tp}" STREQUAL "EXECUTABLE") + set(__default_scope PRIVATE) + set(__link TRUE) + else() + return() + endif() + + set_target_properties(${tgt} PROPERTIES + C_STANDARD ${REGOR_C_STANDARD} + C_STANDARD_REQUIRED ${REGOR_C_STANDARD_REQUIRED} + C_EXTENSIONS ${REGOR_C_EXTENSIONS} + CXX_STANDARD ${REGOR_CXX_STANDARD} + CXX_STANDARD_REQUIRED ${REGOR_CXX_STANDARD_REQUIRED} + CXX_EXTENSIONS ${REGOR_CXX_EXTENSIONS} + POSITION_INDEPENDENT_CODE ${REGOR_POSITION_INDEPENDENT_CODE} + CXX_VISIBILITY_PRESET ${REGOR_CXX_VISIBILITY_PRESET} + VISIBILITY_INLINES_HIDDEN ${REGOR_VISIBILITY_INLINES_HIDDEN} + INTERPROCEDURAL_OPTIMIZATION ${REGOR_ENABLE_LTO} + INTERPROCEDURAL_OPTIMIZATION_${CMAKE_BUILD_TYPE} ${REGOR_ENABLE_LTO}) + + target_compile_definitions(${tgt} ${__default_scope} ${REGOR_DEFAULT_DOPTS}) + target_compile_options(${tgt} ${__default_scope} ${REGOR_DEFAULT_COPTS}) + if (__link) + target_link_options(${tgt} ${__default_scope} ${REGOR_DEFAULT_LOPTS}) + endif() +endfunction() diff --git a/ethosu/regor/cmake/regor_test.cmake b/ethosu/regor/cmake/regor_test.cmake new file mode 100644 index 00000000..b0995cac --- /dev/null +++ b/ethosu/regor/cmake/regor_test.cmake @@ -0,0 +1,123 @@ +# +# SPDX-FileCopyrightText: Copyright 2021, 2023 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# No inclusion guard. We need these set and included in every scope + +set(ENABLE_COVERAGE ${REGOR_ENABLE_COVERAGE} CACHE INTERNAL + "Enable coverage passed to codecov package") + +include(regor_lib) +include(utils) +include(CTest) +include(Catch) + +find_package(codecov) + +if (NOT TARGET check) + # Target to build and run all unit-tests from the top level + add_custom_target(check + COMMAND ${CMAKE_CTEST_COMMAND} -L unit_test --output-on-failure $<$:--verbose> + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + ) +endif() + +# Coverage target +if(REGOR_ENABLE_COVERAGE AND NOT TARGET coverage) + add_custom_target(coverage) + if(DEFINED ENV{CMAKE_BUILD_PARALLEL_LEVEL}) + set(CHECK_JOB_CNT --parallel $ENV{CMAKE_BUILD_PARALLEL_LEVEL}) + endif() + add_custom_command(TARGET coverage POST_BUILD + COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} -t check ${CHECK_JOB_CNT} + COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} -t lcov-capture + COMMAND ${CMAKE_COMMAND} --build ${CMAKE_BINARY_DIR} -t lcov-genhtml + ) +endif() + +function(add_catch_test) + cmake_parse_arguments( + _FARG + "" + "NAME" + "SOURCES;DEPS;COPTS;DEFINES;INC_DIRS" + ${ARGN} + ) + ### Adds an executable unit test and ammends it to the unit tests top target + regor_exe(NAME ${_FARG_NAME} + SOURCES ${_FARG_SOURCES} + COPTS ${_FARG_COPTS} + DEFINES ${_FARG_DEFINES} + INC_DIRS ${_FARG_INC_DIRS} + ) + # Links the individual tests to the build all target + add_dependencies(check ${_FARG_NAME}) + # Tests require introspection + foreach(_dep ${_FARG_DEPS}) + if (NOT TARGET ${_dep}) + continue() + endif() + get_target_property(__incs ${_dep} INCLUDE_DIRECTORIES) + if (__incs) + target_include_directories(${_FARG_NAME} PRIVATE ${__incs}) + endif() + endforeach() + # Link deps + target_link_libraries(${_FARG_NAME} PRIVATE ${_FARG_DEPS}) + # Now finally enable catch + target_link_libraries(${_FARG_NAME} PRIVATE regor::Catch2) + # Valgrind support + if (REGOR_ENABLE_VALGRIND) + find_program(VALGRIND_EXECUTABLE 
valgrind REQUIRED) + # We hijack CROSSCOMPILING_EMULATOR to get catch_ctest + # to prepend the valgrind command to all executables + set_property(TARGET ${_FARG_NAME} PROPERTY + CROSSCOMPILING_EMULATOR ${VALGRIND_EXECUTABLE} $ENV{VALGRIND_OPTIONS}) + endif() + catch_discover_tests(${_FARG_NAME} PROPERTIES LABELS unit_test) + # Coverage + add_coverage(${_FARG_NAME}) +endfunction() + +function(add_py_test) + cmake_parse_arguments( + _FARG + "" + "NAME" + "SOURCES;DEPS" + ${ARGN} + ) + list(TRANSFORM _FARG_SOURCES PREPEND ${CMAKE_CURRENT_LIST_DIR}/) + add_test(NAME ${_FARG_NAME} + COMMAND ${Python3_EXECUTABLE} -m pytest ${_FARG_SOURCES} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} + ) + foreach(_dep ${_FARG_DEPS}) + list(APPEND _INC_DIRS $) + endforeach() + if(MSVC) + set(_path_sep ";") + else() + set(_path_sep ":") + endif() + string(REPLACE ";" "${_path_sep}" _INC_DIRS "${_INC_DIRS}") + set_tests_properties(${_FARG_NAME} + PROPERTIES + ENVIRONMENT "PYTHONPATH=${_INC_DIRS}${_path_sep}$ENV{PYTHONPATH}" + LABELS unit_test + ) +endfunction() diff --git a/ethosu/regor/cmake/regor_thirdparty.cmake b/ethosu/regor/cmake/regor_thirdparty.cmake new file mode 100644 index 00000000..7ad6aec2 --- /dev/null +++ b/ethosu/regor/cmake/regor_thirdparty.cmake @@ -0,0 +1,49 @@ +# +# SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +include_guard(GLOBAL) + +include(regor_lib) + +############################################################################ +# Header only libs +############################################################################ + +add_library(fmt INTERFACE) +target_compile_definitions(fmt INTERFACE FMT_HEADER_ONLY) +target_include_directories(fmt SYSTEM INTERFACE + $ + $) +add_library(regor::fmt ALIAS fmt) + +set(CATCH2_DIR "dependencies/thirdparty/Catch2") +add_subdirectory(${CATCH2_DIR}) +target_include_directories(Catch2 SYSTEM INTERFACE ${CMAKE_CURRENT_SOURCE_DIR}/${CATCH2_DIR}/src/catch2) +include(Catch) +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/${CATCH2_DIR}/CMake") +add_library(regor::Catch2 ALIAS Catch2) + +add_library(flatbuffers INTERFACE) +target_include_directories(flatbuffers SYSTEM INTERFACE + "${CMAKE_CURRENT_SOURCE_DIR}/dependencies/thirdparty/flatbuffers/include") +add_library(regor::flatbuffers ALIAS flatbuffers) + +add_library(gemmlowp INTERFACE) +target_include_directories(gemmlowp SYSTEM INTERFACE + "${CMAKE_CURRENT_SOURCE_DIR}/dependencies/thirdparty/gemmlowp") +add_library(regor::gemmlowp ALIAS gemmlowp) diff --git a/ethosu/regor/cmake/toolchains/clang.cmake b/ethosu/regor/cmake/toolchains/clang.cmake new file mode 100644 index 00000000..2221fb61 --- /dev/null +++ b/ethosu/regor/cmake/toolchains/clang.cmake @@ -0,0 +1,51 @@ +# +# SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +include_guard(GLOBAL) + +set(CMAKE_TOOLCHAIN_PREFIX llvm-) + +find_program(_C_COMPILER "clang" REQUIRED) +set(CMAKE_C_COMPILER ${_C_COMPILER}) + +find_program(_CXX_COMPILER "clang++" REQUIRED) +set(CMAKE_CXX_COMPILER ${_CXX_COMPILER}) + +find_program(_CXX_COMPILER_AR "${CMAKE_TOOLCHAIN_PREFIX}ar" REQUIRED) +set(CMAKE_CXX_COMPILER_AR ${_CXX_COMPILER_AR}) +set(CMAKE_AR ${_CXX_COMPILER_AR}) + +find_program(_CXX_COMPILER_RANLIB "${CMAKE_TOOLCHAIN_PREFIX}ranlib" REQUIRED) +set(CMAKE_CXX_COMPILER_RANLIB ${_CXX_COMPILER_RANLIB}) +set(CMAKE_RANLIB ${_CXX_COMPILER_RANLIB}) + +# Find GCC base path and use it +find_program(_GCC_PATH gcc REQUIRED) +get_filename_component(_GCC_PATH ${_GCC_PATH} DIRECTORY) +add_compile_options(--gcc-toolchain=${_GCC_PATH}/..) +add_link_options(--gcc-toolchain=${_GCC_PATH}/..) + +# Add system headers from GCC +execute_process(COMMAND cpp -xc++ -Wp,-v /dev/null OUTPUT_QUIET ERROR_VARIABLE cpp_out) +string(REPLACE " " ";" cpp_out "${cpp_out}") +string(REPLACE "\n" ";" cpp_out "${cpp_out}") +foreach (e ${cpp_out}) + if (IS_DIRECTORY "${e}") + include_directories(SYSTEM ${e}) + endif() +endforeach() diff --git a/ethosu/regor/cmake/toolchains/clang32.cmake b/ethosu/regor/cmake/toolchains/clang32.cmake new file mode 100644 index 00000000..c262f13d --- /dev/null +++ b/ethosu/regor/cmake/toolchains/clang32.cmake @@ -0,0 +1,61 @@ +# +# SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +include_guard(GLOBAL) + +set(CMAKE_CROSSCOMPILING TRUE) +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR i386 CACHE STRING "" FORCE) + +set(CMAKE_TOOLCHAIN_PREFIX llvm-) + +find_program(_C_COMPILER "clang" REQUIRED) +set(CMAKE_C_COMPILER ${_C_COMPILER}) + +find_program(_CXX_COMPILER "clang++" REQUIRED) +set(CMAKE_CXX_COMPILER ${_CXX_COMPILER}) + +find_program(_CXX_COMPILER_AR "${CMAKE_TOOLCHAIN_PREFIX}ar" REQUIRED) +set(CMAKE_CXX_COMPILER_AR ${_CXX_COMPILER_AR}) +set(CMAKE_AR ${_CXX_COMPILER_AR}) + +find_program(_CXX_COMPILER_RANLIB "${CMAKE_TOOLCHAIN_PREFIX}ranlib" REQUIRED) +set(CMAKE_CXX_COMPILER_RANLIB ${_CXX_COMPILER_RANLIB}) +set(CMAKE_RANLIB ${_CXX_COMPILER_RANLIB}) + +# Find GCC base path and use it +find_program(_GCC_PATH i686-linux-gnu-gcc) +if (NOT _GCC_PATH) + find_program(_GCC_PATH gcc REQUIRED) +endif() +get_filename_component(_GCC_PATH ${_GCC_PATH} DIRECTORY) +add_compile_options(--gcc-toolchain=${_GCC_PATH}/..) +add_link_options(--gcc-toolchain=${_GCC_PATH}/..) 
+ +# Add system headers from GCC +execute_process(COMMAND cpp -m32 -xc++ -Wp,-v /dev/null OUTPUT_QUIET ERROR_VARIABLE cpp_out) +string(REPLACE " " ";" cpp_out "${cpp_out}") +string(REPLACE "\n" ";" cpp_out "${cpp_out}") +foreach (e ${cpp_out}) + if (IS_DIRECTORY "${e}") + include_directories(SYSTEM ${e}) + endif() +endforeach() + +add_compile_options("-m32" "-march=i686" "-msse" "-msse2" "-mfpmath=sse") +add_link_options("-m32") diff --git a/ethosu/regor/cmake/toolchains/gcc.cmake b/ethosu/regor/cmake/toolchains/gcc.cmake new file mode 100644 index 00000000..5aa821fc --- /dev/null +++ b/ethosu/regor/cmake/toolchains/gcc.cmake @@ -0,0 +1,37 @@ +# +# SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +include_guard(GLOBAL) + + +set(CMAKE_TOOLCHAIN_PREFIX "gcc-") +set(CMAKE_TOOLCHAIN_SUFFIX "" CACHE STRING "Toolchain suffix e.g. 
-8 for GCC8") # Can be -9 for GCC9 or -8 for GCC8 etc +find_program(_C_COMPILER "gcc${CMAKE_TOOLCHAIN_SUFFIX}" REQUIRED) + +set(CMAKE_C_COMPILER ${_C_COMPILER}) + +find_program(_CXX_COMPILER "g++${CMAKE_TOOLCHAIN_SUFFIX}" REQUIRED) +set(CMAKE_CXX_COMPILER ${_CXX_COMPILER}) + +find_program(_CXX_COMPILER_AR "${CMAKE_TOOLCHAIN_PREFIX}ar${CMAKE_TOOLCHAIN_SUFFIX}" REQUIRED) +set(CMAKE_CXX_COMPILER_AR ${_CXX_COMPILER_AR}) +set(CMAKE_AR ${_CXX_COMPILER_AR}) + +find_program(_CXX_COMPILER_RANLIB "${CMAKE_TOOLCHAIN_PREFIX}ranlib${CMAKE_TOOLCHAIN_SUFFIX}" REQUIRED) +set(CMAKE_CXX_COMPILER_RANLIB ${_CXX_COMPILER_RANLIB}) +set(CMAKE_RANLIB ${_CXX_COMPILER_RANLIB}) diff --git a/ethosu/regor/cmake/toolchains/gcc32.cmake b/ethosu/regor/cmake/toolchains/gcc32.cmake new file mode 100644 index 00000000..19946a3e --- /dev/null +++ b/ethosu/regor/cmake/toolchains/gcc32.cmake @@ -0,0 +1,55 @@ +# +# SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +include_guard(GLOBAL) + + +set(CMAKE_CROSSCOMPILING TRUE) +set(CMAKE_SYSTEM_NAME Linux) +set(CMAKE_SYSTEM_PROCESSOR i386 CACHE STRING "" FORCE) + +# First look for a cross-compilation toolchain +set(CMAKE_TOOLCHAIN_PREFIX i686-linux-gnu-) +set(CMAKE_TOOLCHAIN_SUFFIX "" CACHE STRING "Toolchain suffix e.g. 
-8 for GCC8") # Can be -9 for GCC9 or -8 for GCC8 etc + +find_program(_C_COMPILER "${CMAKE_TOOLCHAIN_PREFIX}gcc${CMAKE_TOOLCHAIN_SUFFIX}") +if (NOT _C_COMPILER) + # Assume the main toolchain can manage + set(CMAKE_TOOLCHAIN_PREFIX "") +endif() + +# Find compilers +find_program(_C_COMPILER "${CMAKE_TOOLCHAIN_PREFIX}gcc${CMAKE_TOOLCHAIN_SUFFIX}" REQUIRED) +find_program(_CXX_COMPILER "${CMAKE_TOOLCHAIN_PREFIX}g++${CMAKE_TOOLCHAIN_SUFFIX}" REQUIRED) + +# Now set prefix for binutils +set(CMAKE_TOOLCHAIN_PREFIX ${CMAKE_TOOLCHAIN_PREFIX}gcc-) + +set(CMAKE_C_COMPILER ${_C_COMPILER}) +set(CMAKE_CXX_COMPILER ${_CXX_COMPILER}) + +find_program(_CXX_COMPILER_AR "${CMAKE_TOOLCHAIN_PREFIX}ar${CMAKE_TOOLCHAIN_SUFFIX}" REQUIRED) +set(CMAKE_CXX_COMPILER_AR ${_CXX_COMPILER_AR}) +set(CMAKE_AR ${_CXX_COMPILER_AR}) + +find_program(_CXX_COMPILER_RANLIB "${CMAKE_TOOLCHAIN_PREFIX}ranlib${CMAKE_TOOLCHAIN_SUFFIX}" REQUIRED) +set(CMAKE_CXX_COMPILER_RANLIB ${_CXX_COMPILER_RANLIB}) +set(CMAKE_RANLIB ${_CXX_COMPILER_RANLIB}) + +add_compile_options("-m32" "-march=i686" "-msse" "-msse2" "-mfpmath=sse") +add_link_options("-m32") diff --git a/ethosu/regor/cmake/utils.cmake b/ethosu/regor/cmake/utils.cmake new file mode 100644 index 00000000..f8d11598 --- /dev/null +++ b/ethosu/regor/cmake/utils.cmake @@ -0,0 +1,75 @@ +# +# SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +include_guard(GLOBAL) + +# Anchor the project root folder +set(REGOR_SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/.. CACHE INTERNAL "") + +# Flag target interface include paths as SYSTEM to prevent warnings leaking out +function(utils_set_system_include_paths tgt) + get_target_property(__tgt_type ${tgt} TYPE) + if (${__tgt_type} STREQUAL "INTERFACE_LIBRARY") + set(__default_scope INTERFACE) + else() + set(__default_scope PUBLIC) + endif() + get_target_property(itf_inc_dirs ${tgt} INTERFACE_INCLUDE_DIRECTORIES) + if (itf_inc_dirs) + target_include_directories(${tgt} SYSTEM BEFORE ${__default_scope} ${itf_inc_dirs}) + endif() +endfunction() + +# Disable warnings for target +function(utils_disable_warnings tgt) + get_target_property(__tgt_type ${tgt} TYPE) + if (NOT ${__tgt_type} STREQUAL "INTERFACE_LIBRARY") + # Remove previous setting to prevent MSVC warning + get_target_property(copts ${tgt} COMPILE_OPTIONS) + if (copts) + set(new_copts) + foreach (c IN LISTS copts) + if (c MATCHES "^/W[0-9]") + continue() + endif() + list(APPEND new_copts "${c}") + endforeach() + set_target_properties(${tgt} PROPERTIES COMPILE_OPTIONS "${new_copts}") + endif() + target_compile_options(${tgt} PRIVATE "$,,-w>") + endif() +endfunction() + +# Find Python the right way +macro(utils_find_python) + if (NOT Python3_FOUND) + if(CMAKE_VERSION VERSION_LESS "3.18.0") + if (DEFINED ENV{PYTHON_VERSION}) + find_package(Python3 $ENV{PYTHON_VERSION} EXACT COMPONENTS Interpreter Development REQUIRED) + else() + find_package(Python3 COMPONENTS Interpreter Development REQUIRED) + endif() + else() + if (DEFINED ENV{PYTHON_VERSION}) + find_package(Python3 $ENV{PYTHON_VERSION} EXACT COMPONENTS Interpreter Development.Module REQUIRED) + else() + find_package(Python3 COMPONENTS Interpreter Development.Module REQUIRED) + endif() + endif() + endif() +endmacro() diff --git a/ethosu/regor/common/bit_flags.hpp b/ethosu/regor/common/bit_flags.hpp new file mode 100644 index 00000000..15c8646d --- /dev/null +++ 
b/ethosu/regor/common/bit_flags.hpp @@ -0,0 +1,332 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "common/lexer.hpp" + +#include +#include + +struct EnumNameEntry +{ + unsigned value; + const char *name; +}; + +template +static ENUM OrFlags(ENUM a) +{ + return a; +} + +template +static ENUM OrFlags(ENUM a, ARGS... rest) +{ + return ENUM(typename std::underlying_type::type(OrFlags(std::forward(rest)...)) | typename std::underlying_type::type(a)); +} + +static std::string EnumFlagsToString(unsigned value, const EnumNameEntry *table, int tableLength); +static unsigned StringToEnumFlags(Lexer &lexer, const EnumNameEntry *table, int tableLength); + +/// +/// Enumerated flags wrapper +/// +template +struct Flags +{ + using TYPE = typename std::underlying_type::type; + +private: + TYPE _raw = 0; + +public: + Flags() {} + Flags(ENUM val) : _raw(TYPE(val)) {} + + template + Flags(ENUM val, ARGS... 
rest) + { + _raw = TYPE(val) | TYPE(OrFlags(std::forward(rest)...)); + } + + explicit Flags(TYPE val) : _raw(val) {} + Flags(const Flags &) = default; + +public: + bool operator==(ENUM val) const { return _raw == TYPE(val); } + bool operator!=(ENUM val) const { return _raw != TYPE(val); } + bool operator<(ENUM val) const { return _raw < TYPE(val); } + operator bool() const { return _raw != 0; } + operator ENUM() const { return ENUM(_raw); } + explicit operator unsigned() const { return unsigned(_raw); } + + Flags &operator=(ENUM val) + { + _raw = TYPE(val); + return *this; + } + Flags &operator=(const Flags &other) + { + _raw = other._raw; + return *this; + } + Flags &operator&=(ENUM val) + { + _raw &= TYPE(val); + return *this; + } + Flags &operator&=(const Flags &other) + { + _raw &= other._raw; + return *this; + } + Flags &operator|=(ENUM val) + { + _raw |= TYPE(val); + return *this; + } + Flags &operator|=(const Flags &other) + { + _raw |= other._raw; + return *this; + } + Flags &operator^=(ENUM val) + { + _raw ^= TYPE(val); + return *this; + } + Flags &operator^=(const Flags &other) + { + _raw ^= other._raw; + return *this; + } + + Flags operator&(ENUM val) const { return Flags(ENUM(_raw & TYPE(val))); } + Flags operator|(ENUM val) const { return Flags(ENUM(_raw | TYPE(val))); } + Flags operator^(ENUM val) const { return Flags(ENUM(_raw ^ TYPE(val))); } + Flags operator~() const { return Flags(ENUM(~_raw)); } + + // Extract non-bitfield item + unsigned GetUInt(ENUM offset, int bits) { return (_raw >> int(offset)) & ((1u << bits) - 1u); } + + // Set multiple flags + Flags &Set(TYPE val) + { + _raw |= TYPE(val); + return *this; + } + Flags &Set(ENUM val) { return Set(TYPE(val)); } + template + Flags &Set(ENUM val, ARGS... 
rest) + { + return Set(TYPE(val) | TYPE(OrFlags(std::forward(rest)...))); + } + + // Unset multiple flags + Flags &Unset(TYPE val) + { + _raw &= ~val; + return *this; + } + Flags &Unset(ENUM val) { return Unset(TYPE(val)); } + template + Flags &Unset(ENUM val, ARGS... rest) + { + return Unset(TYPE(val) | TYPE(OrFlags(std::forward(rest)...))); + } + + // Test any set + bool Any(TYPE val) const { return (_raw & TYPE(val)) != 0; } + bool Any(ENUM val) const { return Any(TYPE(val)); } + template + bool Any(ENUM val, ARGS... rest) const + { + TYPE mask = TYPE(val) | TYPE(OrFlags(std::forward(rest)...)); + return (_raw & mask) != 0; + } + + // Test all set + bool All(TYPE val) const { return (_raw & TYPE(val)) == TYPE(val); } + bool All(ENUM val) const { return All(TYPE(val)); } + template + bool All(ENUM val, ARGS... rest) const + { + TYPE mask = TYPE(val) | TYPE(OrFlags(std::forward(rest)...)); + return (_raw & mask) == mask; + } + + std::string ToString() const + { + int length = 0; + const EnumNameEntry *table = GetTable(length); + return EnumFlagsToString(_raw, table, length); + } + + void Parse(const std::string &text) + { + int length = 0; + const EnumNameEntry *table = GetTable(length); + Lexer lexer(text.data(), int(text.size())); + _raw = StringToEnumFlags(lexer, table, length); + } + +private: + // Proxy function for type erasure + static const EnumNameEntry *GetTable(int &length) + { + extern const EnumNameEntry *GetEnumTable(ENUM, int &); + length = 0; + return GetEnumTable(ENUM(0), length); + } +}; + +static std::string EnumToString(unsigned value, const EnumNameEntry *table, int tableLength) +{ + auto pos = std::find_if(table, table + tableLength, [&value](const EnumNameEntry &v) { return v.value == value; }); + if ( pos != table + tableLength ) + { + return pos->name; + } + return std::to_string(value); +} + +static std::string EnumFlagsToString(unsigned value, const EnumNameEntry *table, int tableLength) +{ + if ( value == 0 ) return EnumToString(value, 
table, tableLength); + unsigned mask = 1; + std::string text; + while ( mask <= value ) + { + if ( value & mask ) + { + if ( !text.empty() ) + { + text += '|'; + } + + auto pos = std::find_if( + table, table + tableLength, [&mask](const EnumNameEntry &v) { return v.value == mask; }); + if ( pos != table + tableLength ) + { + text += pos->name; + } + else + { + text += std::to_string(mask); + } + } + mask = mask << 1; + } + return text; +} + +template +static std::string EnumToString(ENUM value) +{ + extern const EnumNameEntry *GetEnumTable(ENUM, int &); + int length = 0; + auto table = GetEnumTable(ENUM(0), length); + return EnumToString(unsigned(value), table, length); +} + +static unsigned StringToEnumFlags(Lexer &lexer, const EnumNameEntry *table, int tableLength) +{ + unsigned value = 0; + std::string ident; + bool isXor = false; + while ( true ) + { + if ( !lexer.SkipSpace() ) + { + break; + } + if ( lexer.GetIdent(ident, false) ) + { + auto pos = std::find_if( + table, table + tableLength, [&ident](const EnumNameEntry &v) { return ident == v.name; }); + if ( pos != table + tableLength ) + { + value = isXor ? 
(value ^ pos->value) : (value | pos->value); + isXor = false; + } + } + if ( !lexer.SkipSpace() ) + { + break; + } + if ( lexer.Expect('^') ) + { + isXor = true; + } + else if ( !lexer.Expect('|') ) + { + break; + } + } + return value; +} + +template, int> = 0> +inline std::string format_as(const Flags &flags) noexcept +{ + return flags.ToString(); +} + +// Use to treat enumerations as flags when defined as single-bit +// numeric values: +// +// enum class Type +// { +// First=1, +// Second=2, +// Third=4 +// }; +// +// Wrap enumerations with Flags template and use as a bitset: +// +// Flags flags(Type::First, Type::Second); +// flags |= Type::Third; +// +// To convert to/from string, use macros to build a mapping table +// +// BEGIN_ENUM_TABLE(Type) +// ADD_ENUM_NAME(First) +// ADD_ENUM_NAME(Second) +// ADD_ENUM_NAME(Third) +// END_ENUM_TABLE() +// +// Then use flags.ToString() or flags.Parse() to convert between +// representations. + +#define BEGIN_ENUM_TABLE(TYPE) \ + const EnumNameEntry *GetEnumTable(TYPE disc, int &length); \ + const EnumNameEntry *GetEnumTable(TYPE disc, int &length) \ + { \ + (void)disc; \ + using ENUM_TYPE = TYPE; \ + static EnumNameEntry table[] = { +#define ADD_ENUM_NAME(ENUM) {unsigned(ENUM_TYPE::ENUM), #ENUM}, +#define END_ENUM_TABLE() \ + } \ + ; \ + length = int(std::size(table)); \ + return table; \ + } diff --git a/ethosu/regor/common/box.hpp b/ethosu/regor/common/box.hpp new file mode 100644 index 00000000..f4c957d1 --- /dev/null +++ b/ethosu/regor/common/box.hpp @@ -0,0 +1,63 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common.hpp" +#include "shape.hpp" + +#include +#include + +class Box +{ +private: + Shape _start; + Shape _end; + +public: + Box() = default; + + Box(const Shape &start, const Shape &end) : _start(start), _end(end) + { + assert(start.Size() == end.Size()); + assert(start <= end); + } + + Box(const Shape &end) : Box(end.WithZeros(), end) {} + + const Shape &Start() const { return _start; } + const Shape &End() const { return _end; } + + Shape SizeShape() const { return _end - _start; } + + bool Overlaps(const Box &other) const + { + int sz = _start.Size(); + for ( int i = 0; i < sz; ++i ) + { + if ( !::Overlaps(_start[i], _end[i], other._start[i], other._end[i]) ) + { + return false; + } + } + return true; + } + + std::string ToString() const { return fmt::format("[{} - {}]", _start.ToString(), _end.ToString()); } +}; diff --git a/ethosu/regor/common/buffer_view.hpp b/ethosu/regor/common/buffer_view.hpp new file mode 100644 index 00000000..3c65f0d9 --- /dev/null +++ b/ethosu/regor/common/buffer_view.hpp @@ -0,0 +1,445 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common.hpp" +#include "shape.hpp" + +#include +#include +#include + +namespace regor +{ + +/// +/// Buffer mechanism for local/remote data storage +/// +class Buffer : public std::enable_shared_from_this +{ + typedef void (*DeleteFunc)(void *); + +#define FOR_ALL_INT_TYPES(functor, sep) \ + functor(uint8_t) sep functor(uint16_t) \ + sep functor(uint32_t) \ + sep functor(uint64_t) \ + sep functor(int8_t) \ + sep functor(int16_t) \ + sep functor(int32_t) \ + sep functor(int64_t) + + union LocalStorage + { + LocalStorage() {} + ~LocalStorage() {} +#define TYPE_FUNC(x) std::vector as_##x + FOR_ALL_INT_TYPES(TYPE_FUNC, ;); +#undef TYPE_FUNC + }; + + template + struct IsSupportedIntegral + { +#define TYPE_FUNC(x) std::is_same::value + static constexpr bool value = FOR_ALL_INT_TYPES(TYPE_FUNC, ||); +#undef TYPE_FUNC + }; + + template + struct IsByte + { + static constexpr bool value = + std::is_same::value || std::is_same::value || std::is_same::value; + }; + + // TODO : make a proper type hash + template + struct TypeHash + { + static constexpr uint32_t value = (std::is_signed::value ? 
1U << 16 : 0) | sizeof(TYPE); + }; + + union RefData + { + void *data; + const void *cdata; + }; + +private: + RefData _refData = {}; + int _sizeBytes = 0; + const uint32_t _typeHash; + const uint32_t _utypeHash; + bool _isLocal = false; + LocalStorage _localStorage; + DeleteFunc _deleter = nullptr; + +public: + Buffer(const Buffer &) = delete; + Buffer &operator=(const Buffer &) = delete; + + template::value, int> = 0> + Buffer(int sizeElements, const TYPE *buffer = nullptr, bool alias = false) : + _typeHash(TypeHash::value), _utypeHash(TypeHash>::value) + { + _sizeBytes = sizeof(TYPE) * sizeElements; + if ( buffer == nullptr || !alias ) + { + assert(sizeElements > 0); + auto ref = new TYPE[sizeElements]; + if ( buffer ) + { + std::copy_n(buffer, sizeElements, ref); + } + _refData.data = ref; + _deleter = &Buffer::DeleteArray; + } + else + { + assert(alias && buffer); + _refData.cdata = buffer; + } + } + + template::value, int> = 0> + Buffer(std::unique_ptr ptr) : + _typeHash(TypeHash::value), _utypeHash(TypeHash>::value) + { + _refData.data = ptr.release(); + _sizeBytes = sizeof(TYPE); + _deleter = &Buffer::Delete; + } + + template::value, int> = 0> + Buffer(std::unique_ptr ptr, int sizeElements) : + _typeHash(TypeHash::value), _utypeHash(TypeHash>::value) + { + _refData.data = ptr.release(); + assert(sizeElements > 0); + assert(INT_MAX / int(sizeof(TYPE)) >= sizeElements); + _sizeBytes = sizeof(TYPE) * sizeElements; + _deleter = &Buffer::DeleteArray; + } + + template::value, int> = 0> + Buffer(std::vector &&buffer) : + _typeHash(TypeHash::value), _utypeHash(TypeHash>::value) + { + new (&GetLocalVector()) std::vector(std::move(buffer)); + _deleter = &Buffer::DeleteVector; + _refData.data = &GetLocalVector(); + _isLocal = true; + } + + ~Buffer() + { + if ( _deleter ) + { + _deleter(_refData.data); + } + } + +public: + template + T *Data() + { + // Follow strict reinterpret_cast type aliasing rules + assert(IsByte::value || (TypeHash>::value == _utypeHash)); + if ( 
_isLocal ) + { + if constexpr ( IsByte::value ) + { + switch ( _typeHash ) + { +#define TYPE_FUNC(x) \ + case TypeHash::value: \ + return reinterpret_cast(GetLocalVector().data()) + FOR_ALL_INT_TYPES(TYPE_FUNC, ;); +#undef TYPE_FUNC + default: + assert(false); + return nullptr; + } + } + else + { + using S = std::make_signed_t; + using U = std::make_unsigned_t; + switch ( _typeHash ) + { + case TypeHash::value: + return reinterpret_cast(GetLocalVector().data()); + case TypeHash::value: + return reinterpret_cast(GetLocalVector().data()); + default: + assert(false); + return nullptr; + } + } + } + else + { + assert(_deleter); + return reinterpret_cast(_refData.data); + } + } + template + const T *Data() const + { + if ( _isLocal ) + { + // Follow strict reinterpret_cast type aliasing rules + assert(IsByte::value || (TypeHash>::value == _utypeHash)); + if constexpr ( IsByte::value ) + { + switch ( _typeHash ) + { +#define TYPE_FUNC(x) \ + case TypeHash::value: \ + return reinterpret_cast(GetLocalVector().data()) + FOR_ALL_INT_TYPES(TYPE_FUNC, ;); +#undef TYPE_FUNC + default: + assert(false); + return nullptr; + } + } + else + { + using S = std::make_signed_t; + using U = std::make_unsigned_t; + switch ( _typeHash ) + { + case TypeHash::value: + return reinterpret_cast(GetLocalVector().data()); + case TypeHash::value: + return reinterpret_cast(GetLocalVector().data()); + default: + assert(false); + return nullptr; + } + } + } + else + { + assert(uintptr_t(_deleter ? _refData.data : _refData.cdata) % alignof(T) == 0); + return reinterpret_cast(_deleter ? 
_refData.data : _refData.cdata); + } + } + + int Size() const + { + if ( _isLocal ) + { + switch ( _typeHash ) + { +#define TYPE_FUNC(x) \ + case TypeHash::value: \ + return int(GetLocalVector().size() * sizeof(x)) + FOR_ALL_INT_TYPES(TYPE_FUNC, ;); +#undef TYPE_FUNC + default: + assert(false); + return 0; + } + } + else + { + return _sizeBytes; + } + } + +private: + template + std::vector &GetLocalVector() + { + if constexpr ( false ) + { + } +#define TYPE_FUNC(x) else if constexpr ( std::is_same::value ) return _localStorage.as_##x + FOR_ALL_INT_TYPES(TYPE_FUNC, ;); +#undef TYPE_FUNC + else + { + static_assert(IsSupportedIntegral::value, ""); + return _localStorage.as_uint8_t; + } + } + template + const std::vector &GetLocalVector() const + { + if constexpr ( false ) + { + } +#define TYPE_FUNC(x) else if constexpr ( std::is_same::value ) return _localStorage.as_##x + FOR_ALL_INT_TYPES(TYPE_FUNC, ;); +#undef TYPE_FUNC + else + { + static_assert(IsSupportedIntegral::value, ""); + return _localStorage.as_uint8_t; + } + } + + template + static void Delete(void *p) + { + delete reinterpret_cast(p); + } + template + static void DeleteArray(void *p) + { + delete[] reinterpret_cast(p); + } + template + static void DeleteVector(void *v) + { + using vec = std::vector; + static_cast(v)->~vec(); + } +#undef FOR_ALL_INT_TYPES +}; + + +/// +/// Access proxy for processing values within a buffer +/// +template +class BufferValues +{ + using PTR_TYPE = typename std::conditional_t; + +private: + PTR_TYPE _data; + Shape _strideBytes; + +public: + BufferValues(PTR_TYPE data, const Shape &strideBytes) : _data(data), _strideBytes(strideBytes) {} + + template = 0> + TYPE &operator[](int index) + { + return _data[index]; + } + + const TYPE &operator[](int index) const { return _data[index]; } + + int ElementIndex(const Shape &offset) const + { + int index = offset.Dot(_strideBytes) / sizeof(TYPE); + return index; + } +}; + + +/// +/// View of buffer memory +/// +class BufferView +{ 
+protected: + std::shared_ptr _buffer; + int _elementBits = 0; + int _baseOffset = 0; + Shape _axisElements; + Shape _strideBytes; + +public: + BufferView() {} + + BufferView(const std::shared_ptr &buffer, int firstElement, int elementBits, const Shape &axisElements, const Shape &strideBytes) + { + assert(elementBits >= 8 && elementBits % 8 == 0); + _buffer = buffer; + _elementBits = elementBits; + _baseOffset = firstElement; + _axisElements = axisElements; + if ( strideBytes.IsEmpty() ) + { + // Calculate byte strides + int sz = axisElements.Size(); + if ( sz > 0 ) + { + std::vector strides(sz); + int v = 1; + for ( int i = sz - 1; i >= 0; --i ) + { + strides[i] = (v * elementBits) / 8; + v *= axisElements[i]; + } + + _strideBytes = Shape(&strides[0], sz); + } + } + else + { + _strideBytes = strideBytes; + } + } + + BufferView(const std::shared_ptr &buffer, const BufferView &other) + { + _buffer = buffer; + _elementBits = other._elementBits; + _baseOffset = 0; + _axisElements = other._axisElements; + _strideBytes = other._strideBytes; + } + +public: + bool HasBuffer() const { return _buffer != nullptr; } + const Shape &ViewShape() const { return _axisElements; } + const Shape &StrideBytes() const { return _strideBytes; } + + BufferView Reshape(const Shape &size) const + { + assert(size.Elements() == _axisElements.Elements()); + return BufferView(_buffer, 0, _elementBits, size, Shape()); + } + + BufferView SubView(const Shape &offset, const Shape &size) const + { + assert(size.Elements() < _axisElements.Elements()); + int linearOffset = offset.Dot(_strideBytes); + return BufferView(_buffer, linearOffset, _elementBits, size, _strideBytes); + } + + template + BufferValues Values() const + { + assert(HasBuffer()); + auto start = const_cast(_buffer.get())->Data() + _baseOffset; + return BufferValues(start, _strideBytes); + } + + template + BufferValues WritableValues() + { + assert(HasBuffer()); + auto start = _buffer->Data() + _baseOffset; + return BufferValues(start, 
_strideBytes); + } + + int BufferSize() const { return _buffer->Size(); } + + const class Buffer *Buffer() const { return _buffer.get(); } +}; + + +} // namespace regor diff --git a/ethosu/regor/common/common.cpp b/ethosu/regor/common/common.cpp new file mode 100644 index 00000000..c4a6f61a --- /dev/null +++ b/ethosu/regor/common/common.cpp @@ -0,0 +1,29 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common.hpp" + +#include + +UniqueId GenerateUniqueId() +{ + static std::atomic _id; + UniqueId id = _id.fetch_add(1); + assert(id != std::numeric_limits::max()); + return id; +} diff --git a/ethosu/regor/common/common.hpp b/ethosu/regor/common/common.hpp new file mode 100644 index 00000000..af33a901 --- /dev/null +++ b/ethosu/regor/common/common.hpp @@ -0,0 +1,172 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +// Macro to mark variables as unused +#if !defined UNUSED +#define UNUSED(x) (void)(x) +#endif + +#define MACRO_EXPAND(a) a +#define MACRO_CONCAT_IMPL(a, b) a##b +#define MACRO_CONCAT(a, b) MACRO_CONCAT_IMPL(a, b) + +// clang-format off +#if __GNUC__ + #define DLL_EXPORT __attribute__((visibility("default"))) + #define _strnicmp strncasecmp +#elif _WIN32 + #if TARGET_WIN32_DLL + #define DLL_EXPORT __declspec(dllexport) + #else + #define DLL_EXPORT + #endif + #ifndef ssize_t + #define ssize_t ptrdiff_t + #endif +#else + #error "undefined export semantics" +#endif +// clang-format on + +#include +#include +#include +#include +#include +#include + +template, int> = 0> +constexpr std::underlying_type_t format_as(ENUMTYPE e) noexcept +{ + return static_cast>(e); +} + +namespace regor +{ +template, int> = 0> +constexpr std::underlying_type_t format_as(ENUMTYPE e) noexcept +{ + return static_cast>(e); +} +} // namespace regor + +#define DECLARE_ENUM_AS_FLAGS(ENUM_) \ + static constexpr inline ENUM_ operator&(ENUM_ a, ENUM_ b) \ + { \ + return ENUM_(std::underlying_type::type(a) & std::underlying_type::type(b)); \ + } \ + static constexpr inline ENUM_ operator|(ENUM_ a, ENUM_ b) \ + { \ + return ENUM_(std::underlying_type::type(a) | std::underlying_type::type(b)); \ + } \ + static constexpr inline ENUM_ operator^(ENUM_ a, ENUM_ b) \ + { \ + return ENUM_(std::underlying_type::type(a) ^ std::underlying_type::type(b)); \ + } + +using UniqueId = uint32_t; + +UniqueId GenerateUniqueId(); + +#define VERIFY(x_) (assert(x_), (x_)) + +namespace regor 
+{ + +#if defined __GNUC__ + +template +constexpr const char *PlatformRootName() +{ + const char *p = __PRETTY_FUNCTION__; + return p; +} + +template +constexpr std::string_view PlatformTypeName() +{ + const char *p = PlatformRootName(); + while ( *p++ != '=' ) + { + }; + while ( *p == ' ' ) + { + p++; + }; + std::size_t i = 0; + while ( (p[i] != 0) && (p[i] != ']') && (*p != ';') ) + i++; + return std::string_view(p, i); +} + +#elif _MSC_VER + +template +constexpr const char *PlatformRootName() +{ + const char *p = __FUNCSIG__; + return p; +} + +template +constexpr std::string_view PlatformTypeName() +{ + const char *p = PlatformRootName(); + while ( *p++ != '<' ) + ; + if ( (*p == 'c') || (*p == 'u') || (*p == 's') || (*p == 'e') ) + { + if ( std::string_view(p, 7) == "struct " ) p += 7; + else if ( std::string_view(p, 6) == "class " ) p += 6; + else if ( std::string_view(p, 6) == "union " ) p += 6; + else if ( std::string_view(p, 5) == "enum " ) p += 5; + } + std::size_t i = 0; + while ( p[i] != '(' ) + i++; + return std::string_view(p, i - 1); +} + +#else +#error No type hash for this target +#endif + +template +static constexpr uint32_t PlatformTypeHash() +{ + const std::string_view name = PlatformTypeName(); + auto p = name.begin(); + auto e = name.end(); + if constexpr ( NO_NAMESPACE ) + { + while ( *p != ':' ) + p++; + while ( *p == ':' ) + p++; + } + uint32_t hash = 0x811c9dc5; // FNV-1a SEED + while ( p < e ) + { + hash = (hash ^ uint8_t(*p++)) * 0x01000193; // FNV-1a PRIME + } + return hash; +} + +} // namespace regor diff --git a/ethosu/regor/common/data_type.cpp b/ethosu/regor/common/data_type.cpp new file mode 100644 index 00000000..d9c8ac21 --- /dev/null +++ b/ethosu/regor/common/data_type.cpp @@ -0,0 +1,72 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in 
compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common/data_type.hpp" + +#include "common/logging.hpp" + +#include "common/bit_flags.hpp" + +BEGIN_ENUM_TABLE(regor::DataType) + ADD_ENUM_NAME(None) + ADD_ENUM_NAME(Bits4) + ADD_ENUM_NAME(Bits8) + ADD_ENUM_NAME(Bits16) + ADD_ENUM_NAME(Bits32) + ADD_ENUM_NAME(Bits64) + ADD_ENUM_NAME(Bits128) + ADD_ENUM_NAME(Signed) + ADD_ENUM_NAME(Asymmetric) + ADD_ENUM_NAME(Int) + ADD_ENUM_NAME(SignedInt) + ADD_ENUM_NAME(Int4) + ADD_ENUM_NAME(Int8) + ADD_ENUM_NAME(Int16) + ADD_ENUM_NAME(Int32) + ADD_ENUM_NAME(Int48) + ADD_ENUM_NAME(Int64) + ADD_ENUM_NAME(UInt8) + ADD_ENUM_NAME(UInt16) + ADD_ENUM_NAME(UInt32) + ADD_ENUM_NAME(UInt64) + ADD_ENUM_NAME(QInt) + ADD_ENUM_NAME(QInt4) + ADD_ENUM_NAME(QInt8) + ADD_ENUM_NAME(QInt12) + ADD_ENUM_NAME(QInt16) + ADD_ENUM_NAME(QInt32) + ADD_ENUM_NAME(QUInt) + ADD_ENUM_NAME(QUInt4) + ADD_ENUM_NAME(QUInt8) + ADD_ENUM_NAME(QUInt12) + ADD_ENUM_NAME(QUInt16) + ADD_ENUM_NAME(QUInt32) + ADD_ENUM_NAME(Float) + ADD_ENUM_NAME(Float16) + ADD_ENUM_NAME(Float32) + ADD_ENUM_NAME(Float64) + ADD_ENUM_NAME(Bool) + ADD_ENUM_NAME(Bool8) + ADD_ENUM_NAME(Complex) + ADD_ENUM_NAME(Complex64) + ADD_ENUM_NAME(Complex128) + ADD_ENUM_NAME(VariablySized) + ADD_ENUM_NAME(String) + ADD_ENUM_NAME(Resource) + ADD_ENUM_NAME(Variant) +END_ENUM_TABLE() diff --git a/ethosu/regor/common/data_type.hpp b/ethosu/regor/common/data_type.hpp new file mode 100644 index 00000000..06c79c2c --- /dev/null +++ b/ethosu/regor/common/data_type.hpp @@ -0,0 +1,290 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its 
affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once +#include "common/bit_flags.hpp" + +#include +#include +#include + +namespace regor +{ +class int48_t +{ +public: + int48_t() = default; + int48_t(const int48_t &obj) = default; + int48_t(int48_t &&obj) noexcept = default; + int48_t(const int64_t val) + { + for ( int i = 0; i < 6; i++ ) + { + _data[i] = (val & (uint64_t(0xFF) << i * 8)) >> (i * 8); + } + } + + int48_t(const uint64_t val) { int48_t(static_cast(val)); } + + operator int64_t() const + { + int64_t res = 0; + for ( int i = 0; i < 6; i++ ) + { + res |= uint64_t(_data[i]) << (16 + i * 8); + } + + return res >> 16; + } + + operator uint64_t() const { return static_cast(operator int64_t()); } + +private: + uint8_t _data[6]{0}; +}; + +enum class DataType : uint16_t +{ + None = 0, + // Bits 1 and 2 reserved for disambiguating variably sized types + Bits4 = 1 << 2, + Bits8 = 1 << 3, + Bits16 = 1 << 4, + Bits32 = 1 << 5, + Bits48 = Bits32 | Bits16, + Bits64 = 1 << 6, + Bits128 = 1 << 7, + Signed = 1 << 8, + Packed = 1 << 9, + Asymmetric = 1 << 10, + Int = 1 << 11, + SignedInt = Signed | Int, + Int4 = SignedInt | Bits4, + Int4Packed8 = SignedInt | Bits8 | Bits4 | Packed, + Int8 = SignedInt | Bits8, + Int16 = SignedInt | Bits16, + Int32 = SignedInt | Bits32, + Int48 = SignedInt | Bits48, + Int64 = SignedInt | Bits64, + UInt8 = Int | Bits8, + UInt16 = Int | Bits16, + UInt32 = Int | Bits32, + 
UInt48 = Int | Bits48, + UInt64 = Int | Bits64, + QInt = Asymmetric | SignedInt, + QInt4 = QInt | Bits4, + QInt8 = QInt | Bits8, + QInt12 = QInt | Bits8 | Bits4, + QInt16 = QInt | Bits16, + QInt32 = QInt | Bits32, + QUInt = Asymmetric, + QUInt4 = QUInt | Bits4, + QUInt8 = QUInt | Bits8, + QUInt12 = QUInt | Bits8 | Bits4, + QUInt16 = QUInt | Bits16, + QUInt32 = QUInt | Bits32, + Float = 1 << 12, + BFloat16 = Float | Bits16 | Packed, + Float16 = Float | Bits16, + Float32 = Float | Bits32, + Float64 = Float | Bits64, + Bool = 1 << 13, + Bool8 = Bool | Bits8, + Complex = 1 << 14, + Complex64 = Complex | Bits64, + Complex128 = Complex | Bits128, + VariablySized = 1 << 15, + String = VariablySized | 1, + Resource = VariablySized | 2, + Variant = VariablySized | 3, +}; + +inline constexpr DataType operator&(DataType type, DataType mask) +{ + return DataType(unsigned(type) & unsigned(mask)); +} +inline constexpr DataType operator&(DataType type, unsigned mask) +{ + return DataType(unsigned(type) & mask); +} +inline constexpr DataType operator|(DataType type, DataType mask) +{ + return DataType(unsigned(type) | unsigned(mask)); +} +inline constexpr DataType operator|(DataType type, unsigned mask) +{ + return DataType(unsigned(type) | mask); +} +inline constexpr bool operator!(DataType type) +{ + return type == DataType::None; +} + + +static inline int Clz(uint32_t value) +{ + // Ensure all CLZ implementations return '32 zeroes' + // for a zero value input. 
+ if ( value == 0 ) + { + return 32; + } +#if defined(__GNUC__) + return __builtin_clz(value); +#elif defined(_MSC_VER) + unsigned long index; + _BitScanReverse(&index, value); + return int(31 - index); +#else +#error "Missing platform CLZ32 implementation" +#endif +} + +inline constexpr int DataTypeStorageSizeBits(DataType type) +{ + unsigned bits = unsigned(type & 0x00FFu); + // Interpret packed word size as the largest set bit + if ( (type & DataType::Packed) == DataType::Packed ) + { + assert(bits > 0); + return 1 << (31 - Clz(bits)); + } + return (!(type & DataType::VariablySized) ? std::max(int(bits), 8) : -1); +} + +inline constexpr int DataTypeSizeBits(DataType type) +{ + unsigned bits = unsigned(type & 0x00FFu); + if ( (type & DataType::Packed) == DataType::Packed ) + { + assert(bits > 0); + bits ^= 1 << (31 - Clz(bits)); // Strip container word + } + return (!(type & DataType::VariablySized) ? int(bits) : -1); +} + +inline constexpr int DataTypeStorageSizeBytes(DataType type, int elements) +{ + const int storageBits = DataTypeStorageSizeBits(type); + const int bits = (type & DataType::Packed) == DataType::Packed ? DataTypeSizeBits(type) : storageBits; + assert(storageBits >= 8); + return (((elements * bits) + storageBits - 1) / storageBits) * (storageBits / 8); +} + + +inline constexpr int DataTypeElements(DataType type, int size) +{ + const int bits = (type & DataType::Packed) == DataType::Packed ? DataTypeSizeBits(type) : DataTypeStorageSizeBits(type); + assert(size <= std::numeric_limits::max() / 8); + return 8 * size / bits; +} + +inline constexpr DataType DataTypeBase(DataType type) +{ + return (!(type & DataType::VariablySized) ? 
type & 0xFF00u : type); +} + +inline std::string DataTypeToString(const DataType type) +{ + return EnumToString(type); +} + +inline constexpr bool IsInteger(DataType type) +{ + return (type & DataType::Int) == DataType::Int; +} + +inline constexpr bool IsSignedInteger(DataType type) +{ + return (type & DataType::SignedInt) == DataType::SignedInt; +} + +inline constexpr bool IsFloat(DataType type) +{ + return (type & DataType::Float) == DataType::Float; +} + +inline constexpr bool IsBool(DataType type) +{ + return (type & DataType::Bool) == DataType::Bool; +} + +inline constexpr uint64_t IntegerMax(DataType type) +{ + assert(IsInteger(type)); + return ~0ULL >> (64 - DataTypeSizeBits(type) + int(IsSignedInteger(type))); +} + +inline constexpr int64_t IntegerMin(DataType type) +{ + assert(IsInteger(type)); + if ( IsSignedInteger(type) ) + { + int size = DataTypeSizeBits(type); + return -(1LL << (size - 1)); + } + return 0; +} + +template +struct DataTypeOf +{ + static constexpr DataType value = DataType::None; +}; +template<> +struct DataTypeOf +{ + static constexpr DataType value = DataType::Int8; +}; +template<> +struct DataTypeOf +{ + static constexpr DataType value = DataType::Int16; +}; +template<> +struct DataTypeOf +{ + static constexpr DataType value = DataType::Int32; +}; +template<> +struct DataTypeOf +{ + static constexpr DataType value = DataType::Int64; +}; +template<> +struct DataTypeOf +{ + static constexpr DataType value = DataType::UInt8; +}; +template<> +struct DataTypeOf +{ + static constexpr DataType value = DataType::UInt16; +}; +template<> +struct DataTypeOf +{ + static constexpr DataType value = DataType::UInt32; +}; +template<> +struct DataTypeOf +{ + static constexpr DataType value = DataType::UInt64; +}; + +} // namespace regor diff --git a/ethosu/regor/common/dynamic_typing.hpp b/ethosu/regor/common/dynamic_typing.hpp new file mode 100644 index 00000000..236f8f2a --- /dev/null +++ b/ethosu/regor/common/dynamic_typing.hpp @@ -0,0 +1,255 @@ 
+// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/shape.hpp" + +#include + +namespace regor +{ + +template +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 0; +}; +// Ordinal types +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 1; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 3; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 4; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 5; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 6; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 7; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 8; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 9; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 10; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 11; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 12; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 13; +}; +// Class types +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 0x1F; +}; +template<> +struct FieldTypeId +{ + static constexpr uint8_t TYPEID = 0x2F; +}; + +template +struct 
TypeHash +{ + static constexpr uint32_t HASH = PlatformTypeHash(); +}; + +struct FieldInfo +{ + size_t offset = 0; + uint32_t id = 0; + uint8_t typeId = 0; +}; + +struct TypeInfo +{ + uint32_t _hash; + const FieldInfo *_fields; + size_t _fieldCount; + void (*_deleter)(void *); + void (*_addref)(void *); + +public: + uint32_t Hash() const { return _hash; } + void *AddRef(void *p) const + { + if ( !_addref ) return nullptr; + _addref(p); + return p; + } + void Delete(void *p) const { _deleter(p); } + const FieldInfo *Fields(size_t &length) const + { + length = _fieldCount; + return _fields; + } +}; + +// Dynamic allocation with type-erasure. Use to handle anonymous TYPE +// allocations by passing around a void pointer and the type information +// separately at runtime. +template +struct TypeInfoOf +{ + static void *DefaultNew() { return new TYPE(); } + static void DefaultDeleter(void *p) { delete static_cast(p); } + + struct SharedType + { + TYPE instance; + unsigned ref = 1; + }; + static void *SharedNew() + { + auto *p = new SharedType(); + assert(static_cast(p) == static_cast(&p->instance)); + return &p->instance; + } + static void SharedAddRef(void *p) + { + assert(p); + static_cast(p)->ref++; + } + static void SharedDeleter(void *p) + { + auto *shared = static_cast(p); + if ( --shared->ref == 0 ) delete shared; + } + + static const TypeInfo *Get(bool sharedInstancing) + { + size_t len; + const FieldInfo *f = TYPE::FieldTable(len); + static const TypeInfo s_infoDefault{PlatformTypeHash(), f, len, &DefaultDeleter, nullptr}; + static const TypeInfo s_infoShared{PlatformTypeHash(), f, len, &SharedDeleter, &SharedAddRef}; + return sharedInstancing ? 
&s_infoShared : &s_infoDefault; + } +}; + + +// Container for dynamically typed instances +struct DynamicRef +{ +private: + const TypeInfo *_info = nullptr; + void *_instance = nullptr; + +public: + DynamicRef() = default; + DynamicRef(const TypeInfo *info, void *inst) : _info(info), _instance(inst) {} + DynamicRef(const DynamicRef &other) { *this = other; } + DynamicRef(DynamicRef &&other) noexcept { *this = std::move(other); } + ~DynamicRef() + { + if ( _instance ) + { + assert(_info); + _info->Delete(_instance); + } + } + + DynamicRef &operator=(const DynamicRef &other) + { + if ( _instance ) + { + assert(_info); + _info->Delete(_instance); + _instance = nullptr; + } + if ( &other != this && other._instance ) + { + _info = other._info; + _instance = _info->AddRef(other._instance); + assert(_instance); + } + return *this; + } + + DynamicRef &operator=(DynamicRef &&other) noexcept + { + if ( &other != this ) + { + _info = other._info; + _instance = other._instance; + other._instance = nullptr; + } + return *this; + } + operator bool() const { return _info && _instance; } + void *Instance() { return _instance; } + const TypeInfo *Info() const { return _info; } +}; + + +// clang-format off + +#define BEGIN_FIELD_TABLE(CLASS_) \ + static const FieldInfo *FieldTable(size_t &len) { \ + typedef CLASS_ thisclass_t; \ + static const FieldInfo s_fieldTable[] = { + +#define END_FIELD_TABLE() }; \ + len = std::size(s_fieldTable); \ + return s_fieldTable; } + +// clang-format on + +#define REGOR_FIELD_TYPE(TYPE_) FieldTypeId::type>::TYPEID + + +} // namespace regor diff --git a/ethosu/regor/common/ini_reader.hpp b/ethosu/regor/common/ini_reader.hpp new file mode 100644 index 00000000..5a079900 --- /dev/null +++ b/ethosu/regor/common/ini_reader.hpp @@ -0,0 +1,300 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may 
+// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common.hpp" +#include "lexer.hpp" + +#include +#include +#include + + +/// +/// INI file reader +/// +class IniReader : Lexer +{ + enum class ParseState + { + None, + Section, + Key, + Value + }; + +protected: + ParseState _parseState = ParseState::None; + bool _wasError = false; + +public: + IniReader(const char *src, size_t length) : Lexer(src, length) {} + + IniReader(const IniReader &other) = delete; + +public: + bool Begin(std::string &key) + { + assert(_parseState != ParseState::Value); + + if ( _parseState == ParseState::None ) + { + if ( !SkipCommentSpace() ) + { + return false; + } + + // Expect section + if ( !Expect('[') ) + { + // Error state + _wasError = true; + return false; + } + _parseState = ParseState::Section; + } + + if ( _parseState == ParseState::Section ) + { + // Get section identifier + if ( !GetString(key, 0, 0, ']') ) + { + return false; + } + if ( !SkipWhite() ) + { + return false; + } + if ( !Expect(']') ) + { + // Error state + _wasError = true; + return false; + } + _parseState = ParseState::Key; + } + else if ( _parseState == ParseState::Key ) + { + if ( !SkipCommentSpace() ) + { + return false; + } + // Get key name + if ( !GetIdent(key) ) + { + _parseState = ParseState::None; + return false; + } + while ( Expect('.') ) + { + key += "."; + std::string tmp; + if ( GetIdent(tmp) ) + { + key += tmp; + } + } + if ( !SkipWhite() ) + { + return false; + } + if ( !Expect('=') ) + { + // Error - attempt to recover by jumping to next line + 
_wasError = true; + SkipUntil('\n', true); + return false; + } + _parseState = ParseState::Value; + } + + return true; + } + + void End() + { + if ( _parseState == ParseState::Value ) // End key/value pair + { + // End this line (doesn't work with quoted strings that span lines) + SkipUntil('\n', true); + _parseState = ParseState::Key; + } + else if ( _parseState == ParseState::Key ) // End section + { + // Skip until next section or EOF + std::string skipped; + while ( Begin(skipped) ) + { + assert(_parseState != ParseState::Key); // Prevent recursion depth > 2 + End(); + } + _parseState = ParseState::None; + } + } + + bool Read(bool &value) + { + if ( !SkipWhite() ) + { + return false; + } + + assert(_parseState == ParseState::Value); + + if ( Expect("true") ) value = true; + else if ( Expect("yes") ) value = true; + else if ( Expect("1") ) value = true; + else if ( Expect("false") ) value = false; + else if ( Expect("no") ) value = false; + else if ( Expect("0") ) value = false; + else return false; + + return true; + } + + bool Read(int64_t &value) + { + if ( !SkipSpace() ) + { + return false; + } + + assert(_parseState == ParseState::Value); + + auto [ptr, ec] = std::from_chars(_pos, _end, value); + if ( _pos == ptr ) + { + return false; + } + _pos = ptr; + + // Allow comma-separated value reads + SkipSpace(); + Expect(','); + return true; + } + + bool Read(int &value) + { + if ( !SkipSpace() ) + { + return false; + } + + assert(_parseState == ParseState::Value); + + auto [ptr, ec] = std::from_chars(_pos, _end, value); + if ( _pos == ptr ) + { + return false; + } + _pos = ptr; + + // Allow comma-separated value reads + SkipSpace(); + Expect(','); + return true; + } + + bool Read(float &value) + { + if ( !SkipSpace() ) + { + return false; + } + + assert(_parseState == ParseState::Value); + + char *end; + value = std::strtof(_pos, &end); + if ( end == _pos ) + { + return false; + } + _pos = end; + + // Allow comma-separated value reads + SkipSpace(); + 
Expect(','); + return true; + } + + bool Read(std::string &value) + { + if ( !SkipSpace() ) + { + return false; + } + + assert(_parseState == ParseState::Value); + if ( GetString(value, '"', '\\', '\n') ) + { + if ( !value.empty() && value.back() == '\r' ) + { + value.pop_back(); + } + } + return true; + } + + template + bool Read(std::vector &out) + { + assert(_parseState == ParseState::Value); + + TYPE tmp{}; + while ( Read(tmp) ) + { + out.push_back(tmp); + } + return !out.empty(); + } + + template + TYPE Get() + { + TYPE tmp = TYPE(); + Read(tmp); + return tmp; + } + + ssize_t Position() const { return _pos - _source; } + +private: + bool SkipCommentSpace() + { + // Skip whitespace and comments + while ( true ) + { + if ( !SkipWhite() ) + { + return false; + } + if ( !Expect(';') ) + { + break; + } + if ( !SkipUntil('\n', true) ) + { + return false; + } + } + return true; + } +}; diff --git a/ethosu/regor/common/lexer.hpp b/ethosu/regor/common/lexer.hpp new file mode 100644 index 00000000..a6373f1d --- /dev/null +++ b/ethosu/regor/common/lexer.hpp @@ -0,0 +1,159 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include +#include +#include +#include +#include + + +/// +/// Text string lexer +/// +class Lexer +{ +protected: + const char *_source = nullptr; + const char *_end = nullptr; + const char *_pos = nullptr; + +public: + Lexer(const char *text, size_t length) : _source(text), _end(text + length) { _pos = _source; } + + bool SkipSpace() + { + const char *p = _pos; + while ( p < _end && std::isblank(*p) ) + { + p++; + } + _pos = p; + return p < _end; + } + + bool SkipWhite() + { + const char *p = _pos; + while ( p < _end && std::isspace(*p) ) + { + p++; + } + _pos = p; + return p < _end; + } + + bool SkipUntil(char term, bool consume) + { + const char *p = _pos; + while ( (p < _end) && (*p != term) ) + { + p++; + } + if ( consume && (*p == term) ) + { + p++; + } + _pos = p; + return p < _end; + } + + char Peek() const + { + assert(_pos < _end); + return _pos < _end ? *_pos : '\0'; + } + + bool Expect(char c) + { + assert(_pos < _end); + if ( *_pos == c ) + { + _pos++; + return true; + } + return false; + } + + bool Expect(const char *text, size_t length = 0) + { + ptrdiff_t avail = std::max(0, _end - _pos); + size_t compare = length == 0 ? 
std::char_traits::length(text) : length; + compare = std::min(compare, size_t(avail)); + if ( strncmp(_pos, text, compare) == 0 ) + { + _pos += compare; + return true; + } + return false; + } + + bool GetIdent(std::string &ident, bool skipwhite = true) + { + if ( skipwhite && !SkipWhite() ) + { + return false; + } + + const char *p = _pos; + if ( *p != '_' && !std::isalpha(*p) ) + { + return false; + } + + const char *maxEnd = _end; + while ( (p < maxEnd) && (*p == '_' || std::isalnum(*p)) ) + { + p++; + } + + ident.assign(_pos, p); + _pos = p; + return !ident.empty(); + } + + bool GetString(std::string &text, char quote, char escape, char term) + { + text.reserve(16); + text.clear(); + + bool quoted = Expect(quote); + + const char *p = _pos; + while ( (p < _end) && (*p != term) ) + { + // Handle escaping first to allow quotes and terminator + // in string. + if ( (*p == escape) && (p < _end) ) + { + p++; + } + else if ( quoted ) + { + if ( *p == quote ) break; + } + text += *p; + p++; + } + + _pos = p; + return quoted || !text.empty(); // Quoted strings are intentionally present (just empty) + } +}; diff --git a/ethosu/regor/common/logging.cpp b/ethosu/regor/common/logging.cpp new file mode 100644 index 00000000..d33bd48c --- /dev/null +++ b/ethosu/regor/common/logging.cpp @@ -0,0 +1,70 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "logging.hpp" + +namespace Logging +{ + +LogContext Out("", ~0u); + +static const char g_indentString[] = "\t\t\t\t\t\t\t\t\t\t\t\t"; + +LogContext::LogContext(const char *prefix, uint32_t filterMask) : _prefix(prefix), _filterMask(filterMask) +{ +} + +LogContext::~LogContext() +{ +} + +void LogContext::Indent() +{ + assert(_indent < int(sizeof(g_indentString))); + _indent++; +} + +void LogContext::Unindent() +{ + assert(_indent > 0); + _indent--; +} + +void LogContext::Write(const std::string &s) +{ + assert(_logWriter); + + if ( !_prefix.empty() ) + { + _logWriter(_prefix.data(), _prefix.size()); + } + if ( _indent != 0 ) + { + _logWriter(g_indentString, _indent); + } + + _logWriter(s.data(), s.size()); +} + +void LogContext::WriteLn(const std::string &s) +{ + Write(s); + _logWriter("\n", 1); +} + +} // namespace Logging diff --git a/ethosu/regor/common/logging.hpp b/ethosu/regor/common/logging.hpp new file mode 100644 index 00000000..fb71dec2 --- /dev/null +++ b/ethosu/regor/common/logging.hpp @@ -0,0 +1,200 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "common.hpp" + +#include +#include +#include + +// Log tracing detail settings, set LOG_TRACE_ENABLE to +// TD_0|TD_1|TD_2 to enable those traces before including +// this logging header. +#define TD_0 (1) +#define TD_1 (2) +#define TD_2 (4) + +#if !defined LOG_TRACE_ENABLE +#define LOG_TRACE_ENABLE (TD_0) +#endif + +#define LOG_TRACE0_ON (((LOG_TRACE_ENABLE)&TD_0) != 0) +#define LOG_TRACE0(...) \ + { \ + if ( LOG_TRACE0_ON ) \ + { \ + Logging::Out(8)(__VA_ARGS__); \ + } \ + } + +#define LOG_TRACE1_ON (((LOG_TRACE_ENABLE)&TD_1) != 0) +#define LOG_TRACE1(...) \ + { \ + if ( LOG_TRACE1_ON ) \ + { \ + Logging::Out(16)(__VA_ARGS__); \ + } \ + } + +#define LOG_TRACE2_ON (((LOG_TRACE_ENABLE)&TD_2) != 0) +#define LOG_TRACE2(...) \ + { \ + if ( LOG_TRACE2_ON ) \ + { \ + Logging::Out(32)(__VA_ARGS__); \ + } \ + } + +#if NDEBUG +#define LOG_DEBUG(...) \ + do \ + { \ + } while ( false ) +#else +#define LOG_DEBUG(...) Logging::Out(~0u)(__VA_ARGS__) +#endif + +#define LOG_PRINT(...) \ + { \ + Logging::Out(1)(__VA_ARGS__); \ + } +#define LOG_WARN(...) \ + { \ + Logging::Out(1)(__VA_ARGS__); \ + } +#define LOG_ERROR(...) \ + { \ + Logging::Out(2)(__VA_ARGS__); \ + } + + +extern "C" { +typedef void (*log_writer_t)(const void *data, size_t length); +} + +namespace Logging +{ + +/// +/// Logging context, currently outputs direct to stdout +/// +class LogContext +{ +private: + std::string _prefix; + unsigned _filterMask = ~0u; + int _indent = 0; + log_writer_t _logWriter = nullptr; + +public: + struct Filter + { + public: + LogContext *_context; + unsigned _mask; + + public: + Filter(LogContext *ctx, unsigned mask) : _context(ctx), _mask(mask) {} + + template + void operator()(const char *format, TYPES... args) const + { + if ( _mask & _context->_filterMask ) + { + _context->Write(fmt::format(format, std::forward(args)...)); + } + } + + template + void operator()(const std::string &format, TYPES... 
args) const + { + if ( _mask & _context->_filterMask ) + { + _context->Write(fmt::format(format.c_str(), std::forward(args)...)); + } + } + + template + void Print(const char *format, TYPES... args) const + { + if ( _mask & _context->_filterMask ) + { + _context->Write(fmt::format(format, std::forward(args)...)); + } + } + + template + void Print(const std::string &format, TYPES... args) const + { + if ( _mask & _context->_filterMask ) + { + _context->Write(fmt::format(format, std::forward(args)...)); + } + } + }; + +public: + LogContext(const char *prefix, unsigned filterMask); + ~LogContext(); + +public: + void SetPrefix(const char *prefix) { _prefix = prefix; } + void SetFilterMask(unsigned mask) { _filterMask = mask; } + unsigned FilterMask() const { return _filterMask; } + void SetWriter(log_writer_t writer) { _logWriter = writer; } + void Indent(); + void Unindent(); + void Write(const std::string &s); + void WriteLn(const std::string &s); + + template + void Print(const char *format, TYPES... args) + { + if ( _filterMask != 0 ) + { + Write(fmt::format(format, std::forward(args)...)); + } + } + + template + void Print(const std::string &format, TYPES... 
args) + { + if ( _filterMask != 0 ) + { + Write(fmt::format(format, std::forward(args)...)); + } + } + + Filter operator()(unsigned mask) { return Filter(this, mask); } +}; + +// Default output stream +extern LogContext Out; + +struct LogIndenter +{ + LogContext &_ctx; + LogIndenter(Logging::LogContext &ctx) : _ctx(ctx) { _ctx.Indent(); } + ~LogIndenter() { _ctx.Unindent(); } +}; + +#define LOG_INDENT(ctx) Logging::LogIndenter MACRO_CONCAT(_indent, __LINE__)(ctx) + +} // namespace Logging diff --git a/ethosu/regor/common/numeric_util.hpp b/ethosu/regor/common/numeric_util.hpp new file mode 100644 index 00000000..974db467 --- /dev/null +++ b/ethosu/regor/common/numeric_util.hpp @@ -0,0 +1,412 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#if _MSC_VER +#include +#endif + +template +bool CheckSafeAdd(T a, T b) +{ +#ifdef __GNUC__ + T res; + return !__builtin_add_overflow(a, b, &res); +#else + return (!( + (a > 0 && b > std::numeric_limits::max() - a) || + ((std::is_signed::value && (a < 0 && b < std::numeric_limits::min() - a))))); +#endif +} + +template +bool CheckSafeSub(T a, T b) +{ +#ifdef __GNUC__ + T res; + return !__builtin_sub_overflow(a, b, &res); +#else + return !( + (a > 0 && b < std::numeric_limits::min() + a) || + (std::is_signed::value && (a < 0 && b > std::numeric_limits::max() + a)) || (std::is_unsigned::value && a < b)); +#endif +} + +template +bool CheckSafeMul(T a, T b) +{ +#ifdef __GNUC__ + T res; + return !__builtin_mul_overflow(a, b, &res); +#else + return !( + (std::max(a, b) == -1 && std::min(a, b) == std::numeric_limits::min()) || + (b != 0 && (a > std::numeric_limits::max() / b || (std::is_unsigned::value && (a < std::numeric_limits::min() / b))))); +#endif +} + +template +T SafeAdd(T a, T b) +{ + if ( !CheckSafeAdd(a, b) ) throw std::overflow_error("Addition overflow"); + return a + b; +} + +template +T SafeSub(T a, T b) +{ + if ( !CheckSafeSub(a, b) ) throw std::overflow_error("Subtraction overflow"); + return a - b; +} + +template +T SafeMul(T a, T b) +{ + if ( !CheckSafeMul(a, b) ) throw std::overflow_error("Multiplication overflow"); + return a * b; +} + +template +T AssertAdd(T a, T b) +{ + assert(CheckSafeAdd(a, b)); + return a + b; +} +template +T AssertSub(T a, T b) +{ + assert(CheckSafeSub(a, b)); + return a - b; +} + +template +T AssertMul(T a, T b) +{ + assert(CheckSafeMul(a, b)); + return a * b; +} + +template +class Point2 +{ +public: + TYPE x = 0, y = 0; + +public: + Point2(TYPE xx = 0, TYPE yy = 0) : x(xx), y(yy) {} + +public: + TYPE AreaXY() const { return x * y; } + + Point2 operator+(const Point2 &pt) const + { + return Point2(AssertAdd(x, pt.x), 
AssertAdd(y, pt.y)); + } + Point2 operator-(const Point2 &pt) const + { + return Point2(AssertSub(x, pt.x), AssertSub(y, pt.y)); + } + Point2 operator*(const Point2 &pt) const + { + return Point2(AssertMul(x, pt.x), AssertMul(y, pt.y)); + } + Point2 operator/(const Point2 &pt) const { return Point2(x / pt.x, y / pt.y); } + + bool operator==(const Point2 &pt) const { return (x == pt.x) && (y == pt.y); } + bool operator!=(const Point2 &pt) const { return !((*this) == pt); } + bool operator<(const Point2 &pt) const { return (x < pt.x) || ((x == pt.x) && (y < pt.y)); } + + static Point2 Min(const Point2 &a, const Point2 &b) + { + return Point2(std::min(a.x, b.x), std::min(a.y, b.y)); + } + + static Point2 Max(const Point2 &a, const Point2 &b) + { + return Point2(std::max(a.x, b.x), std::max(a.y, b.y)); + } + + explicit operator uint32_t() const { return (uint32_t(x) << 16) ^ y; } + + explicit operator uint64_t() const { return (uint64_t(x) << 16) ^ y; } +}; + +template +struct Point2Hash +{ + size_t operator()(const Point2 &pt) const { return (pt.x * 8191) ^ pt.y; } +}; + +using Point2i = Point2; + +template +class Point3 +{ +public: + TYPE x = 0, y = 0, z = 0; + +public: + Point3(TYPE xx = 0, TYPE yy = 0, TYPE zz = 0) : x(xx), y(yy), z(zz) {} + +public: + TYPE AreaXY() const { return x * y; } + + Point3 operator+(const Point3 &pt) const { return Point3(x + pt.x, y + pt.y, z + pt.z); } + Point3 operator-(const Point3 &pt) const { return Point3(x - pt.x, y - pt.y, z - pt.z); } + Point3 operator*(const Point3 &pt) const { return Point3(x * pt.x, y * pt.y, z * pt.z); } + Point3 operator/(const Point3 &pt) const { return Point3(x / pt.x, y / pt.y, z / pt.z); } + + bool operator==(const Point3 &pt) const { return (x == pt.x) && (y == pt.y) && (z == pt.z); } + bool operator!=(const Point3 &pt) const { return !((*this) == pt); } + bool operator<(const Point3 &pt) const + { + return (x < pt.x) || ((x == pt.x) && (y < pt.y)) || ((x == pt.x) && (y == pt.y) && (z < pt.z)); + } 
+}; + + +template +TYPE RoundAway(TYPE value, TYPE align) +{ + assert(align > 0); + TYPE rem = value % align; + if ( rem == 0 ) + { + return value; + } + else if ( rem < 0 ) + { + return value - (align + rem); + } + return value + (align - rem); +} + +inline float RoundAway(float value, float align) +{ + assert(align > 0); + if ( value < 0 ) + { + value = value - align + 1; + } + else + { + value = value + align - 1; + } + return std::trunc(value / align) * align; +} + +template +TYPE RoundAwayZero(TYPE value) +{ + return std::trunc(value + (value < 0 ? -0.5 : 0.5)); +} + +template +TYPE RoundZero(TYPE value, TYPE align) +{ + assert(align > 0); + return value - (value % align); +} + +inline float RoundZero(float value, float align) +{ + return std::trunc(value / align) * align; +} + + +template +TYPE DivRoundUp(TYPE a, TYPE b) +{ + return TYPE((a + b - 1) / b); +} + +// Checks if the ranges overlap, to0 and to1 are exclusive +template +bool Overlaps(TYPE from0, TYPE to0, TYPE from1, TYPE to1) +{ + return from0 < to1 && from1 < to0; +} + +template +TYPE ClampSigmoid(TYPE x, TYPE limit) +{ + if ( x <= -limit ) + { + return TYPE(0); + } + else if ( x >= limit ) + { + return TYPE(1); + } + else + { + return TYPE(1 / (1 + std::exp(-x))); + } +} + +inline int NeededTotalPadding(int inputSize, int outputSize, int stride, int filterSize) +{ + int outSize = DivRoundUp(outputSize, stride); + int neededInput = (outSize - 1) * stride + filterSize; + return std::max(0, neededInput - inputSize); +} + +inline int NeededTotalPadding(int inputSize, int stride, int filterSize) +{ + return NeededTotalPadding(inputSize, inputSize, stride, filterSize); +} + +template +uint32_t SimpleHash32(const VALUE &value) +{ + return uint32_t(value); +} + +template +uint32_t SimpleHash32(const VALUE &value, REST &&...rest) +{ + return SimpleHash32(std::forward(rest)...) 
* 31 + uint32_t(value); +} + +template +uint64_t SimpleHash64(const VALUE &value) +{ + return uint64_t(value); +} + +template +uint64_t SimpleHash64(const VALUE &value, REST &&...rest) +{ + return SimpleHash64(std::forward(rest)...) * 31 + uint64_t(value); +} + +static constexpr uint32_t REGOR_FNV_SEED = 0x811c9dc5; +static constexpr uint32_t REGOR_FNV_PRIME = 0x01000193; + +inline uint32_t FNVHashBytes(uint32_t hash, const uint8_t *p, int length) +{ + while ( length-- ) + { + hash ^= *p++; + hash *= REGOR_FNV_PRIME; + } + return hash; +} + +template +uint32_t HashVector32(const std::vector &values) +{ + uint32_t hash = REGOR_FNV_SEED; + for ( auto const x : values ) + { + hash = FNVHashBytes(hash, reinterpret_cast(&x), sizeof(x)); + } + return hash; +} + +template +std::make_unsigned_t ToUnsigned(TYPE x) +{ + assert(x >= 0); + return static_cast>(x); +} + +inline int Clz32(uint32_t x) +{ +#if __GNUC__ + return x == 0 ? 32 : __builtin_clz(x); +#elif _MSC_VER + unsigned long leading_zero = 0; + return _BitScanReverse(&leading_zero, x) ? 31 - leading_zero : 32; +#else +#error "Unsupported toolchain" +#endif +} + +inline int Clz64(uint64_t x) +{ +#if __GNUC__ + return x == 0 ? 64 : __builtin_clzll(x); +#elif _MSC_VER + unsigned long leading_zero = 0; + return _BitScanReverse64(&leading_zero, x) ? 
63 - leading_zero : 64; +#else +#error "Unsupported toolchain" +#endif +} + +template +int IntLog2(T x) +{ + if ( x <= T(0) ) return 0; + + static_assert(std::is_arithmetic_v && (sizeof(T) <= 8), ""); + using itype = std::conditional_t || sizeof(T) == 8, uint64_t, uint32_t>; + itype n; + + if constexpr ( std::is_floating_point_v ) + { + n = itype(std::ceil(x)); + } + else + { + n = itype(x); + } + + if constexpr ( std::is_same_v ) + { + return 63 - Clz64(ToUnsigned(n)); + } + else + { + return 31 - Clz32(ToUnsigned(n)); + } +} + +template +constexpr bool IsPowerOfTwo(T x) +{ + static_assert(std::is_integral_v, ""); + return x > 0 && (x & (x - 1)) == 0; +} + +template +OUT ClampToType(IN x) +{ + static_assert(std::is_floating_point_v == std::is_floating_point_v, ""); + IN hi = std::numeric_limits::max(); + IN lo; + if constexpr ( std::is_floating_point_v ) + { + lo = -std::numeric_limits::max(); + } + else + { + lo = std::numeric_limits::min(); + } + return OUT(std::clamp(x, lo, hi)); +} diff --git a/ethosu/regor/common/ordered_map.hpp b/ethosu/regor/common/ordered_map.hpp new file mode 100644 index 00000000..35c1703d --- /dev/null +++ b/ethosu/regor/common/ordered_map.hpp @@ -0,0 +1,1112 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace armstd +{ + +template +struct ordered_map_hash +{ + size_t operator()(const TYPE &value, size_t limit) const + { + // A better hash would map the incoming value to log2(limit) bits + // before applying the modulo. + // Disabled for hash performance + // coverity[cert_int33_c_violation] + return std::hash()(value) % limit; + } +}; + +template +struct ordered_map_hash>> +{ + size_t operator()(TYPE value, size_t limit) const + { + uintptr_t hash = uintptr_t(value); + // Pointers tend to have alignments that have lower bits set zero. + // TODO: use alignof(std::remove_pointer_t) to determine lower zero count. + hash ^= (hash >> 3) & 7; + hash ^= (hash >> 15) * 3; + // Disabled for hash performance + // coverity[cert_int33_c_violation] + return size_t(hash % limit); + } +}; + +template +struct ordered_map_hash || std::is_enum_v>> +{ + size_t operator()(TYPE value, size_t limit) const + { + // A C++ std::hash implementation need not return a linear mapping for arithmetic values because it is + // implementation defined. The hash function required for this ordered map should perform exactly the same + // across all platforms and ideally produce sensible hashes for small-valued incrementing keys (not all + // compilers do this). 
+ size_t hash = size_t(value); + // Mix in upper bits if the type is large (enum bit flags, for example) + if constexpr ( sizeof(value) > 2 ) + { + hash ^= (size_t(value) >> 15) * 3; + hash ^= (size_t(value) >> 24) * 5; + } + // Disabled for hash performance + // coverity[cert_int33_c_violation] + return hash % limit; + } +}; + + +template, typename INDEXER = int16_t, bool PURE_HASH_CHAINS = true> +class ordered_map +{ +protected: + using this_class_t = ordered_map; + static_assert(std::is_integral::value, "indexer must be integer"); + static constexpr INDEXER HASH_FREE = INDEXER(-2); + static constexpr INDEXER HASH_END = INDEXER(-1); + static constexpr INDEXER NODE_UNLINKED = INDEXER(-1); + + // Node stores two arrays in one allocation. + // - Items array, mapping key->untyped storage + // - The traversal order list + struct Node + { + typename std::aligned_storage::type value; // Untyped value storage (place + // first) + KEY key; // Map key + INDEXER order_next = NODE_UNLINKED; // order forwards traversal + INDEXER order_prev = NODE_UNLINKED; // order backwards traversal + INDEXER hash_next = HASH_FREE; // Same hash collision relocation (-2=free, -1=used/end, otherwise=next bucket) + + Node() = default; + + VALUE &Value() { return *reinterpret_cast(&value); } + + const VALUE &Value() const { return *reinterpret_cast(&value); } + + void copy_links(Node &other) + { + this->order_next = other.order_next; + this->order_prev = other.order_prev; + this->hash_next = other.hash_next; + } + }; + + std::unique_ptr _items; // Bulk node store + int _capacity = 0; // Total allocated capacity + int _itemCount = 0; // Number of inserted/valid items + INDEXER _tableSize = 0; // Hash table size + INDEXER _orderBegin = NODE_UNLINKED; // First item in insertion order + INDEXER _orderLast = NODE_UNLINKED; // Last item in insertion order + INDEXER _allocWatermark = 0; // Location of last overflow allocation + +public: + ordered_map(int initialCapacity = 3) { resize(initialCapacity); 
} + + ordered_map(std::initializer_list> init) + { + resize(int(init.size())); + for ( auto &pair : init ) + { + emplace(pair.first, pair.second); + } + } + + ordered_map(const std::pair *pairs, size_t count) + { + resize(int(count)); + while ( count-- ) + { + emplace(pairs->first, pairs->second); + pairs++; + } + } + + ordered_map(const ordered_map &other) { *this = other; } + + ordered_map(ordered_map &&other) { *this = std::move(other); } + + ordered_map &operator=(const ordered_map &other) + { + // Can't selectively delete operator= via enable_if_t + if constexpr ( !std::is_copy_constructible::value ) + { + throw std::invalid_argument("value type non-copyable"); + } + else if ( this != &other ) + { + clear(); + + // Duplicating a map will compact and rehash it + // pad a little to prevent an immediate resize + // if the user appends an item. + resize(other._itemCount + 2); + + // Duplicate in source's insertion order + int order = other._orderBegin; + while ( order != NODE_UNLINKED ) + { + auto *from = &other._items[order]; + int index = table_index_of(from->key); + auto *to = allocate_node(index, from->key); + to->key = from->key; + ::new (reinterpret_cast(&to->Value())) VALUE(from->Value()); + order = from->order_next; + } + } + return *this; + } + + ordered_map &operator=(ordered_map &&other) + { + if ( this != &other ) + { + clear(); + + _items = std::move(other._items); + _capacity = other._capacity; + _itemCount = other._itemCount; + _tableSize = other._tableSize; + _orderBegin = other._orderBegin; + _orderLast = other._orderLast; + _allocWatermark = other._allocWatermark; + other._itemCount = 0; + other._capacity = 0; + other._orderBegin = NODE_UNLINKED; + } + return *this; + } + + ~ordered_map() { clear(); } + +public: + int size() const { return _itemCount; } + bool empty() const { return _itemCount == 0; } + bool contains(const KEY &key) const + { + int index = table_index_of(key); + return find_node(index, key) != nullptr; + } + + void clear() + { + 
if ( !_items ) return; + // Call destructors in order + int order = _orderBegin; + for ( int i = 0; i < _itemCount; i++ ) + { + assert((i < _itemCount - 1) || (order == _orderLast)); // Consistency error + _items[order].Value().~VALUE(); + order = _items[order].order_next; + } + assert(order == NODE_UNLINKED); // Consistency error + + // Reset every entry in the hash table + for ( int i = 0; i < _capacity; i++ ) + { + _items[i].hash_next = HASH_FREE; + _items[i].order_next = NODE_UNLINKED; + _items[i].order_prev = NODE_UNLINKED; + } + + _itemCount = 0; + _orderBegin = NODE_UNLINKED; + _orderLast = NODE_UNLINKED; + } + + template + VALUE &emplace(const KEY &key, Args &&...args) + { + int index = table_index_of(key); + Node *node = find_node(index, key); + if ( node == nullptr ) + { + node = allocate_node(index, key); + ::new (reinterpret_cast(&node->value)) VALUE(std::forward(args)...); + } + return node->Value(); + } + + // insert(key, value) + // - Appends a new node for key at the end of the iteration order and copies or moves value into it + // - The key must not already be present in the map + // - If the map is at capacity then resizing will occur + // - If resizing occurs, all iterators and references are invalidated. Otherwise, they are unaffected. 
+ + void insert(const KEY &key, const VALUE &value) + { + int index = table_index_of(key); + Node *node = find_node(index, key); + if ( node == nullptr ) + { + node = allocate_node(index, key); + ::new (reinterpret_cast(&node->value)) VALUE(value); // Requires accessible copy constructor + } + else + { + assert(false && "key already present"); + } + } + + void insert(const KEY &key, VALUE &&value) + { + int index = table_index_of(key); + Node *node = find_node(index, key); + if ( node == nullptr ) + { + node = allocate_node(index, key); + ::new (reinterpret_cast(&node->value)) VALUE(std::move(value)); // Requires accessible move + // constructor + } + else + { + assert(false && "key already present"); + } + } + + // reinsert(key, value) + // - If key is not already present in the map: + // - Is equivalent to insert + // - Appends a new node for key at the end of the iteration order and copies or moves value into it + // - If the map is at capacity then resizing will occur + // - If resizing occurs, all iterators and references are invalidated. Otherwise, they are unaffected. 
+ // - If key is already present in the map: + // - Moves the node at key to the back of the iteration order + // - Replaces the VALUE object at key with value + // - No hash chains are modified + // - All iterators remain valid, but iterators to the reinserted node will point to its new position + + void reinsert(const KEY &key, const VALUE &value) + { + int index = table_index_of(key); + Node *node = find_node(index, key); + if ( node == nullptr ) + { + node = allocate_node(index, key); + ::new (reinterpret_cast(&node->value)) VALUE(value); // Requires accessible copy constructor + } + else // Reinsert re-links at end of ordering chain + { + if ( node->order_next != NODE_UNLINKED ) // No need to relink if already at end of chain + { + index = int(node - _items.get()); // Get actual node index + unlink_node_order(node); + + // relink node at end of chain + _items[_orderLast].order_next = INDEXER(index); + node->order_prev = _orderLast; + node->order_next = NODE_UNLINKED; + _orderLast = INDEXER(index); + } + + // Re-assign value (last) + node->Value() = value; + } + } + + void reinsert(const KEY &key, VALUE &&value) + { + int index = table_index_of(key); + Node *node = find_node(index, key); + if ( node == nullptr ) + { + node = allocate_node(index, key); + ::new (reinterpret_cast(&node->value)) VALUE(std::move(value)); // Requires accessible move + // constructor + } + else // Reinsert re-links at end of ordering chain + { + if ( node->order_next != NODE_UNLINKED ) // No need to relink if already at end of chain + { + index = int(node - _items.get()); // Get actual node index + unlink_node_order(node); + + // relink node at end of chain + _items[_orderLast].order_next = INDEXER(index); + node->order_prev = _orderLast; + node->order_next = NODE_UNLINKED; + _orderLast = INDEXER(index); + } + + // Re-assign value (last) + node->Value() = std::move(value); + } + } + + // replace(oldKey, newKey, value) + // - Replaces the node at oldKey with a new key and value but 
same position in iteration order + // - The new key must not already be present in the map + // - All iterators and references are invalidated + // - If the map is at its capacity, resizing will occur + + void replace(const KEY &oldKey, const KEY &newKey, const VALUE &value) + { + insert(newKey, value); + swap_order(find(oldKey), find(newKey)); + erase(oldKey); + } + + // operator[] + // - Analogous to unordered_map::operator[] + // - If insertion occurs, the new node is appended to the end of the iteration order + // - If insertion occurs and the map is at capacity then resizing will occur + // - If resizing occurs, all iterators and references are invalidated. Otherwise, they are unaffected. + + VALUE &operator[](const KEY &key) + { + int index = table_index_of(key); + Node *node = find_node(index, key); + if ( node == nullptr ) + { + node = allocate_node(index, key); + ::new (reinterpret_cast(&node->value)) VALUE(); + } + return node->Value(); + } + + const VALUE &operator[](const KEY &key) const + { + int index = table_index_of(key); + const Node *node = find_node(index, key); + if ( node == nullptr ) throw std::out_of_range("missing key"); + return node->Value(); + } + + const VALUE &at(const KEY &key) const + { + int index = table_index_of(key); + const Node *node = find_node(index, key); + if ( node == nullptr ) throw std::out_of_range("missing key"); + return node->Value(); + } + + VALUE &at(const KEY &key) { return const_cast(const_cast(this)->at(key)); } + + bool try_get(const KEY &key, VALUE &value) const + { + int index = table_index_of(key); + const Node *node = find_node(index, key); + if ( node != nullptr ) + { + value = node->Value(); + return true; + } + return false; + } + + const VALUE *try_ref(const KEY &key) const + { + int index = table_index_of(key); + const Node *node = find_node(index, key); + if ( node != nullptr ) + { + return &node->Value(); + } + return nullptr; + } + + VALUE *try_ref(const KEY &key) { return 
const_cast(const_cast(this)->try_ref(key)); } + + const VALUE &front() const + { + if ( empty() ) throw std::out_of_range("no keys"); + return _items[_orderBegin].Value(); + } + + const VALUE &back() const + { + if ( empty() ) throw std::out_of_range("no keys"); + return _items[_orderLast].Value(); + } + + // Return all keys (in insertion order) + std::vector keys() const + { + std::vector tmp; + tmp.reserve(_itemCount); + // Collect the keys in order + int order = _orderBegin; + for ( int i = 0; i < _itemCount; i++ ) + { + const Node &item = _items[order]; + assert(item.hash_next != HASH_FREE); + tmp.push_back(item.key); + order = item.order_next; + } + assert(order == NODE_UNLINKED); + return tmp; + } + + template + class iterator_base + { + friend this_class_t; + using value_ref_t = typename std::conditional_t; + using value_ptr_t = typename std::conditional_t; + using node_t = typename std::conditional_t; + using iterator_base_t = iterator_base; + + protected: + node_t _items; + int _at; + + public: + iterator_base() = default; + iterator_base(const iterator_base_t &other) { *this = other; } + iterator_base(node_t items, int start) : _items(items), _at(start) {} + + // Value-only access + template = 0> + value_ref_t operator*() const + { + return _items[_at].Value(); + } + + template = 0> + value_ptr_t operator->() + { + return &_items[_at].Value(); + } + + // Pair access + template = 0> + const KEY &key() const + { + return _items[_at].key; + } + template = 0> + value_ref_t value() const + { + return _items[_at].Value(); + } + + template = 0> + std::pair operator*() const + { + return std::pair(key(), value()); + } + + iterator_base_t &operator++() + { + if constexpr ( REVERSE ) _at = _items[_at].order_prev; + else _at = _items[_at].order_next; + return *this; + } + iterator_base_t operator++(int) + { + int tmp = _at; + if constexpr ( REVERSE ) _at = _items[_at].order_prev; + else _at = _items[_at].order_next; + return iterator_base_t(_items, tmp); + } + + 
iterator_base_t &operator--() + { + if constexpr ( REVERSE ) _at = _items[_at].order_next; + else _at = _items[_at].order_prev; + return *this; + } + iterator_base_t operator--(int) + { + int tmp = _at; + if constexpr ( REVERSE ) _at = _items[_at].order_next; + else _at = _items[_at].order_prev; + return iterator_base_t(_items, tmp); + } + + iterator_base_t &operator=(const iterator_base_t &other) + { + _items = other._items; + _at = other._at; + return *this; + } + + bool operator==(const iterator_base &b) const + { + assert(_items == b._items); + return _at == b._at; + } + + bool operator==(const iterator_base &b) const + { + assert(_items == b._items); + return _at == b._at; + } + + bool operator!=(const iterator_base &b) const + { + assert(_items == b._items); + return _at != b._at; + } + + bool operator!=(const iterator_base &b) const + { + assert(_items == b._items); + return _at != b._at; + } + + protected: + Node *node() const { return const_cast(&_items[_at]); } + }; + + using iterator = iterator_base; + using reverse_iterator = iterator_base; + using pair_iterator = iterator_base; + using const_iterator = iterator_base; + using const_reverse_iterator = iterator_base; + using const_pair_iterator = iterator_base; + + // Forward value iterators + iterator begin() { return iterator(_items.get(), _orderBegin); } + iterator end() { return iterator(_items.get(), NODE_UNLINKED); } + const_iterator begin() const { return const_iterator(_items.get(), _orderBegin); } + const_iterator end() const { return const_iterator(_items.get(), NODE_UNLINKED); } + + // Reverse value iterators + reverse_iterator rbegin() { return reverse_iterator(_items.get(), _orderLast); } + reverse_iterator rend() { return reverse_iterator(_items.get(), NODE_UNLINKED); } + const_reverse_iterator rbegin() const { return const_reverse_iterator(_items.get(), _orderLast); } + const_reverse_iterator rend() const { return const_reverse_iterator(_items.get(), NODE_UNLINKED); } + + template + class 
iterator_proxy + { + private: + using outer_type_t = typename std::conditional_t; + outer_type_t _outer; + + public: + iterator_proxy(outer_type_t outer) : _outer(outer) {} + pair_iterator begin() { return pair_iterator(_outer._items.get(), _outer._orderBegin); } + pair_iterator end() { return pair_iterator(_outer._items.get(), NODE_UNLINKED); } + const_pair_iterator begin() const { return const_pair_iterator(_outer._items.get(), _outer._orderBegin); } + const_pair_iterator end() const { return const_pair_iterator(_outer._items.get(), NODE_UNLINKED); } + }; + + iterator_proxy pairs() { return iterator_proxy(*this); } + iterator_proxy pairs() const { return iterator_proxy(*this); } + + iterator find(const KEY &key) + { + int index = table_index_of(key); + const Node *node = find_node(index, key); + if ( node == nullptr ) + { + return end(); + } + return iterator(_items.get(), int(node - _items.get())); + } + + const_iterator find(const KEY &key) const + { + int index = table_index_of(key); + const Node *node = find_node(index, key); + if ( node == nullptr ) + { + return end(); + } + return const_iterator(_items.get(), int(node - _items.get())); + } + + // reorder_after(key, position) and reorder_before(key, position) + // - Moves a node to a new position in iteration order + // - reorder_after() moves the node to immediately after position in iteration order + // - reorder_before() moves the node to immediately before position in iteration order + // - Key must already be present in the map + // - end() can be used as the position for reorder_before() but not reorder_after() + // - No hash chains are modified + // - No VALUE objects are moved, copied, created or destroyed + // - Container size is not changed and no reallocation occurs + // - All iterators remain valid, but iterators to the reordered node will point to its new position + + void reorder_after(const KEY &key, iterator position) + { + int index = table_index_of(key); + Node *node = find_node(index, 
key); + reorder_node(node, position.node(), true); + } + + void reorder_before(const KEY &key, iterator position) + { + int index = table_index_of(key); + Node *node = find_node(index, key); + if ( position == end() ) + { + reorder_node(node, &_items[_orderLast], true); + } + else + { + reorder_node(node, position.node(), false); + } + } + + // erase(const KEY &key) + // - Analogous to std::unordered_map::erase() + // - Removes key from map, if present, and returns the number of elements removed + // - Unlike unordered_map::erase(), all iterators and references are invalidated (not just the removed element) + + int erase(const KEY &key) + { + int index = table_index_of(key); + int prevIndex = -1; // local sentinel - not the indexer value + Node *node = find_node(index, key, &prevIndex); + if ( node != nullptr ) + { + deallocate_node(node, prevIndex); + return 1; + } + return 0; + } + + // erase(iterator pos) + // - Analogous to std::vector::erase() + // - Removes the node at pos and returns an iterator to the next node in iteration order + // - Unlike vector::erase(), all iterators and references are invalidated (not just the removed element onwards) + + iterator erase(iterator pos) + { + assert(pos._at != NODE_UNLINKED); + const auto &key = _items[pos._at].key; + int index = table_index_of(key); // Locate in hash chain (iterators walk the ordering chain) + int prevIndex = -1; // local sentinel - not the indexer value + int nextOrder = _items[pos._at].order_next; + Node *node = find_node(index, key, &prevIndex); + assert(node != nullptr); + deallocate_node(node, prevIndex); + return iterator(_items.get(), nextOrder); + } + + // swap_order() + // - Swaps the iteration order of two nodes specified by iterator + // - Both iterators must be dereferenceable + // - No hash chains are modified + // - No VALUE objects are moved, copied, created or destroyed + // - Container size is not changed and no reallocation occurs + // - All iterators remain valid, but iterators to 
either of the two swapped nodes will point to their new position + + void swap_order(iterator first, iterator second) + { + assert(first != end() && second != end()); + swap_node_order(first.node(), second.node()); + } + + bool key_of(const VALUE &value, KEY &key) const + { + int order = _orderBegin; + while ( order != NODE_UNLINKED ) + { + auto *node = &_items[order]; + if ( node->Value() == value ) + { + key = node->key; + return true; + } + order = node->order_next; + } + return false; + } + +private: + ordered_map(this_class_t &other, int capacity) + { + assert(capacity < std::numeric_limits::max() - 2); // Capacity exceeds indexer + + resize(capacity); + + // Duplicate in source's insertion order + int order = other._orderBegin; + while ( order != NODE_UNLINKED ) + { + auto *from = &other._items[order]; + int index = table_index_of(from->key); + auto *to = allocate_node(index, from->key); + copy_move_helper()(to->key, from->key); + ::new (reinterpret_cast(&to->value)) VALUE(); + copy_move_helper()(to->Value(), from->Value()); + order = from->order_next; + } + } + + void resize(int capacity) + { + int newTableSize = hashtable_size(capacity); + assert(capacity < std::numeric_limits::max() - 2); // Capacity exceeds indexer + if ( capacity <= _capacity ) return; + + // Same hash table size, just move old items into the new item storage + if ( !_items || _tableSize == newTableSize ) + { + std::unique_ptr newItems = std::make_unique(capacity); + if ( _items ) + { + // Probably resizing because we are full, so move everything + for ( int i = 0; i < _capacity; i++ ) + { + auto *from = &_items[i]; + auto *to = &newItems[i]; + to->copy_links(*from); + copy_move_helper()(to->key, from->key); + ::new (reinterpret_cast(&to->value)) VALUE(); + copy_move_helper()(to->Value(), from->Value()); + } + } + _items = std::move(newItems); + _capacity = capacity; + _tableSize = INDEXER(newTableSize); + _allocWatermark = INDEXER(capacity - 1); + } + else // Rehash by stealing the 
internals of another map + { + // This may occur recursively if the hash function is poor + this_class_t temp(*this, capacity); + *this = std::move(temp); + } + assert(_tableSize != 0); + } + + Node *allocate_node(int index, const KEY &key) + { + // This function must only be called when an allocation is required + assert(index >= 0 && index < _tableSize); + // Try to insert into the hash table, if given index isn't free then find a free node and link onto that + while ( _items[index].hash_next != HASH_FREE ) + { + int prev = index; + index = find_free_index(); + if ( index < 0 ) + { + // Conservative resize strategy + resize(_capacity + (_capacity + 1) / 2); + index = table_index_of(key); + continue; + } + + // Find the end of the hash chain + while ( _items[prev].hash_next != HASH_END ) + { + prev = _items[prev].hash_next; + } + + _items[prev].hash_next = INDEXER(index); + } + + Node *node = &_items[index]; + node->hash_next = HASH_END; // this node is now used but not linked + node->key = key; + if ( _orderBegin == NODE_UNLINKED ) + { + _orderBegin = _orderLast = INDEXER(index); + node->order_next = node->order_prev = NODE_UNLINKED; + } + else + { + _items[_orderLast].order_next = INDEXER(index); + node->order_next = NODE_UNLINKED; + node->order_prev = _orderLast; + _orderLast = INDEXER(index); + } + + _itemCount++; + return node; + } + + void deallocate_node(Node *node, int prevIndex) + { + // Remove from ordering chain FIRST: + unlink_node_order(node); + + // Remove from hash chain: Just unlink node if not in initial bucket + if ( prevIndex != -1 ) + { + node->Value().~VALUE(); + _items[prevIndex].hash_next = node->hash_next; + node->hash_next = HASH_FREE; + } + else if ( node->hash_next == HASH_END ) + { + node->Value().~VALUE(); + node->hash_next = HASH_FREE; + } + // If we are at the start of a chain then we need to move the next hashed node into this bucket (painful) + else + { + // If the hash table is used as overflow allocation (i.e. 
PURE_HASH_CHAINS == false) + // the next node in this chain may also be the start of another chain, which is not handled here. + assert(PURE_HASH_CHAINS); // TODO: Add support for deallocation when PURE_HASH_CHAINS==false + + Node *next = &_items[node->hash_next]; + copy_move_helper()(node->Value(), next->Value()); + node->key = next->key; + node->hash_next = next->hash_next; + next->hash_next = HASH_FREE; + + // Relink ordering for the moved node + node->order_next = next->order_next; + node->order_prev = next->order_prev; + update_node_order(node); + + next->order_next = NODE_UNLINKED; + next->order_prev = NODE_UNLINKED; + } + + _itemCount--; + } + + void unlink_node_order(Node *node) + { + // If this is not the first node, unlink from previous + if ( node->order_prev != NODE_UNLINKED ) + { + _items[node->order_prev].order_next = node->order_next; + } + else + { + _orderBegin = node->order_next; + } + + // If this is not the last node, unlink from next + if ( node->order_next != NODE_UNLINKED ) + { + _items[node->order_next].order_prev = node->order_prev; + } + else + { + _orderLast = node->order_prev; + } + + node->order_next = NODE_UNLINKED; + node->order_prev = NODE_UNLINKED; + } + + void update_node_order(Node *node) + { + const auto index = INDEXER(node - _items.get()); + if ( node->order_prev == NODE_UNLINKED ) + { + _orderBegin = index; + } + else + { + _items[node->order_prev].order_next = index; + } + if ( node->order_next == NODE_UNLINKED ) + { + _orderLast = index; + } + else + { + _items[node->order_next].order_prev = index; + } + } + + void reorder_node(Node *node, Node *position, bool after) + { + assert(node != nullptr); + assert(position != nullptr); + + if ( node != position ) + { + unlink_node_order(node); + + if ( after ) + { + node->order_next = position->order_next; + node->order_prev = INDEXER(position - _items.get()); + } + else + { + node->order_next = INDEXER(position - _items.get()); + node->order_prev = position->order_prev; + } + + 
update_node_order(node); + } + } + + void swap_node_order(Node *first, Node *second) + { + // Given the two nodes, swap the traversal orders + const INDEXER firstPrev = first->order_prev; + const INDEXER firstNext = first->order_next; + + first->order_prev = second->order_prev; + first->order_next = second->order_next; + + second->order_prev = firstPrev; + second->order_next = firstNext; + + // If the two nodes were adjacent before swapping, then they will each be pointing at themselves now. + // Point them at each other instead. + const auto firstIndex = INDEXER(first - _items.get()); + const auto secondIndex = INDEXER(second - _items.get()); + + if ( firstPrev == secondIndex ) + { + first->order_next = secondIndex; + second->order_prev = firstIndex; + } + else if ( firstNext == secondIndex ) + { + first->order_prev = secondIndex; + second->order_next = firstIndex; + } + + // Inform the affected neighbours of the change in order + update_node_order(first); + update_node_order(second); + } + + const Node *find_node(int index, const KEY &key, int *prev = nullptr) const + { + // Initial index must be within the hashtable + assert(index >= 0 && index < _tableSize); + + // Node is unallocated + if ( _items[index].hash_next == HASH_FREE ) + { + return nullptr; + } + + // Look for matching key + do + { + if ( _items[index].key == key ) + { + assert(_items[index].hash_next != HASH_FREE); // This node MUST BE allocated + return &_items[index]; + } + if ( prev ) + { + *prev = index; + } + index = _items[index].hash_next; + } while ( index != HASH_END ); + + return nullptr; + } + + Node *find_node(int index, const KEY &key, int *prev = nullptr) + { + return const_cast(const_cast(this)->find_node(index, key, prev)); + } + + template + struct copy_move_helper + { + void operator()(TYPE &dst, TYPE &src) const + { + dst = src; + src.~TYPE(); + } + }; + + template + struct copy_move_helper::value>::type> + { + void operator()(TYPE &dst, TYPE &src) const { dst = std::move(src); } 
+ }; + + template + // clang-format off + struct copy_move_helper::value && !(std::is_move_assignable::value || std::is_arithmetic::value || std::is_pointer::value)>::type > + // clang-format on + { + // coverity[dont_call:FALSE] + void operator()(TYPE &dst, TYPE &src) const { std::memcpy(&dst, &src, sizeof(TYPE)); } + }; + + int table_index_of(const KEY &key, int tableSize) const { return int(HASH()(key, size_t(tableSize))); } + int table_index_of(const KEY &key) const { return table_index_of(key, _tableSize); } + + int find_free_index() + { + const int minAlloc = PURE_HASH_CHAINS ? _tableSize : 0; + + // Crude search for a free slot (start at the watermark) + for ( int i = _allocWatermark; i >= minAlloc; --i ) + { + if ( _items[i].hash_next == HASH_FREE ) + { + _allocWatermark = INDEXER(i - 1); + return i; + } + } + for ( int i = _capacity - 1; i > _allocWatermark; --i ) + { + if ( _items[i].hash_next == HASH_FREE ) + { + _allocWatermark = INDEXER(i - 1); + return i; + } + } + return -1; // Error, no free slots + } + + int hashtable_size(int &capacity) + { + // Prime table is tuned for rehashing while the capacity is small. Whereas + // table sizes for larger capacities are sparser, when rehashing is expensive. + // Note: Not all starting capacities give the same resize pattern! 
+ // clang-format off + static constexpr int primes[] = { 3, 7, 11, 13, 17, 19, 23, 31, 41, 67, 97, 131, + 197, 257, 509, 1021, 2039, 4093, 8191, 16381 }; + // clang-format on + assert(capacity < std::numeric_limits::max() - 2); // Capacity exceeds indexer + capacity = std::max(capacity, 2); + + // Estimate to expect ~1 collision per element + int estimatedTableSize = (capacity + 1) / 2; + int tableSize = 2; + // Choose a conservative prime hashtable size for small capacities + // (stops rehashing after final table size has been seen) + for ( int i : primes ) + { + if ( i > estimatedTableSize ) + { + break; + } + tableSize = i; + } + + // Ensure capacity includes space other than just the table + capacity = std::max(capacity, tableSize * 2); + assert(tableSize != 0); + return tableSize; + } +}; + +} // namespace armstd + + +// WORKAROUND: Pull into regor namespace +namespace regor +{ +template, typename INDEXER = int16_t, bool PURE_HASH_CHAINS = true> +using ordered_map = armstd::ordered_map; + +} // namespace regor diff --git a/ethosu/regor/common/reverse_type.cpp b/ethosu/regor/common/reverse_type.cpp new file mode 100644 index 00000000..6525ccd6 --- /dev/null +++ b/ethosu/regor/common/reverse_type.cpp @@ -0,0 +1,30 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "common/reverse_type.hpp" + +#include "common/logging.hpp" + +#include "common/bit_flags.hpp" + +BEGIN_ENUM_TABLE(ReverseType) + ADD_ENUM_NAME(None) + ADD_ENUM_NAME(H) + ADD_ENUM_NAME(W) + ADD_ENUM_NAME(C) +END_ENUM_TABLE() diff --git a/ethosu/regor/common/reverse_type.hpp b/ethosu/regor/common/reverse_type.hpp new file mode 100644 index 00000000..280d76aa --- /dev/null +++ b/ethosu/regor/common/reverse_type.hpp @@ -0,0 +1,28 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once +#include "common/bit_flags.hpp" + +enum class ReverseType : uint8_t +{ + None = 0x0, + C = 0x1, + W = 0x2, + H = 0x4 +}; diff --git a/ethosu/regor/common/scaling.cpp b/ethosu/regor/common/scaling.cpp new file mode 100644 index 00000000..f9fdc664 --- /dev/null +++ b/ethosu/regor/common/scaling.cpp @@ -0,0 +1,93 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "scaling.hpp" + +#include "common/numeric_util.hpp" + +#include +#include + +bool QuantizedScale::operator==(const QuantizedScale &other) const +{ + return (scale == other.scale) && (shift == other.shift); +} + +bool QuantizedScale::operator!=(const QuantizedScale &other) const +{ + return !(*this == other); +} + +QuantizedScale::QuantizedScale(double scale_, bool reduced) +{ + int exponent = 0; + int leftShift = reduced ? 15 : 31; + double significand = std::frexp(scale_, &exponent); + // convert from left to right-shift + scale = int32_t(std::round(significand * double(1LL << leftShift))); + shift = leftShift - exponent; + // if shift is out of bounds [0,63], try to get back within bounds + if ( shift > 63 && scale > std::exp2(shift - 63) ) + { + scale = scale >> (shift - 63); + shift = 63; + } + else if ( shift < 0 && scale < std::exp2(shift + 32) ) + { + scale = scale << (0 - shift); + shift = 0; + } +} + +double QuantizedScale::Dequantize() const +{ + double significand = double(scale); + // ldexp expects a left-shift + // so we convert from right to left-shift + int exp = -shift; + return std::ldexp(significand, exp); +} + +QuantizedScale ElementwiseMulScale(const double inputScale, const double input2Scale, const double outputScale) +{ + // clamp to single-point precision + float ifm1Scale = ClampToType(inputScale); + float ifm2Scale = ClampToType(input2Scale); + float outScale = ClampToType(outputScale); + + float outputRescale = (ifm1Scale * ifm2Scale) / outScale; + return QuantizedScale(outputRescale); +} + +// Convert int32_t 
multiplier to int16_t with rounding. +int16_t DownScaleInt32ToInt16Multiplier(int32_t multiplier) +{ + static constexpr int32_t kRoundingOffset = 1 << 15; + int16_t mul16; + + if ( multiplier >= std::numeric_limits::max() - kRoundingOffset ) + { + mul16 = std::numeric_limits::max(); + } + else + { + mul16 = int16_t((multiplier + kRoundingOffset) >> 16); + } + + return mul16; +} diff --git a/ethosu/regor/common/scaling.hpp b/ethosu/regor/common/scaling.hpp new file mode 100644 index 00000000..dad14714 --- /dev/null +++ b/ethosu/regor/common/scaling.hpp @@ -0,0 +1,49 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once +#include + +class QuantizedScale +{ +public: + int32_t scale; + int shift; + +public: + QuantizedScale() = default; + QuantizedScale(int32_t scale_, int shift_) : scale(scale_), shift(shift_) {} + /** + * Creates a quantized scale and shift from a floating-point + * scale: floating-point representation of the scale + * reduced: reduces the quantization to 16-bit (default 32) + */ + QuantizedScale(double scale_, bool reduced = false); + /* + * Dequantizes scale into floating-point + */ + double Dequantize() const; + bool operator==(const QuantizedScale &other) const; + bool operator!=(const QuantizedScale &other) const; +}; + +/* Calculate elementwise Mul OFM QuantizedScale */ +QuantizedScale ElementwiseMulScale(double inputScale, double input2Scale, double outputScale); + +/* Convert int32_t multiplier to int16_t with rounding. */ +int16_t DownScaleInt32ToInt16Multiplier(int32_t mul); diff --git a/ethosu/regor/common/shape.hpp b/ethosu/regor/common/shape.hpp new file mode 100644 index 00000000..60e673aa --- /dev/null +++ b/ethosu/regor/common/shape.hpp @@ -0,0 +1,733 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "common.hpp" +#include "numeric_util.hpp" +#include "transpose_type.hpp" + +#include +#include +#include +#include +#include +#include + +/// +/// Multi-axis shape description (stores axis dimensions backwards) +/// +class Shape +{ +private: + static constexpr int MAX_STATIC_AXES = 4; + union + { + int32_t axes[MAX_STATIC_AXES]; + int32_t *ptr; + } _storage; + int8_t _last = -1; // Invalid + bool _dynamic = false; + +public: + Shape() {} + + Shape(int c) + { + Init(1); + At(0) = c; + } + + Shape(int w, int c) + { + Init(2); + At(0) = c; + At(1) = w; + } + + Shape(int h, int w, int c) + { + Init(3); + At(0) = c; + At(1) = w; + At(2) = h; + } + + Shape(int n, int h, int w, int c) + { + Init(4); + At(0) = c; + At(1) = w; + At(2) = h; + At(3) = n; + } + + template + Shape(const TYPE *axes, size_t length) + { + assert(length < size_t(std::numeric_limits::max())); + Init(int(length)); + if ( axes != nullptr ) + { + auto *local = Storage(); + // Reverses input into position + assert(size_t(_last) == length - 1); + for ( size_t i = 0; i < length; i++ ) + { + local[_last - i] = int32_t(axes[i]); + } + } + } + + Shape(std::nullptr_t, int length, int fillValue = 0) { Init(length, fillValue); } + + Shape(const Shape &other) + { + if ( other.IsValid() ) + { + Init(other.Size()); + std::copy_n(other.Storage(), Size(), Storage()); + } + } + + explicit Shape(Shape &&other) + { + _storage = other._storage; + _dynamic = other._dynamic; + _last = other._last; + other._storage.ptr = nullptr; + other._dynamic = false; + } + + Shape(const Shape &other, int length, int padValue = 0) + { + if ( other.IsValid() ) + { + Init(length, padValue); + std::copy_n(other.Storage(), std::min(other.Size(), length), Storage()); + } + } + + ~Shape() { Free(); } + +public: + int &operator[](int index) + { + int offset = ToOffset(index); + assert((offset >= 0) && (offset <= _last)); + return At(offset); + } + + int operator[](int index) const + { + int offset = 
ToOffset(index); + assert((offset >= 0) && (offset <= _last)); + return At(offset); + } + + Shape &operator=(const Shape &other) + { + if ( &other != this ) + { + Free(); + if ( other.IsValid() ) + { + Init(other.Size()); + std::copy_n(other.Storage(), Size(), Storage()); + } + } + return *this; + } + + Shape &operator=(Shape &&other) + { + if ( &other != this ) + { + Free(); + _storage = other._storage; + _dynamic = other._dynamic; + _last = other._last; + other._storage.ptr = nullptr; + other._dynamic = false; + other._last = -1; + } + return *this; + } + + bool operator==(const Shape &other) const + { + if ( other._last != _last ) + { + return false; + } + + auto *from = other.Storage(); + auto *local = Storage(); + for ( int i = 0; i <= _last; i++ ) + { + if ( local[i] != from[i] ) + { + return false; + } + } + return true; + } + + explicit operator uint32_t() const { return At(0) ^ (At(1) << 8) ^ (At(2) << 16) ^ (At(3) << 24); } + + explicit operator uint64_t() const { return uint64_t(uint32_t(*this)); } + + explicit operator bool() const { return IsValid(); } + + bool operator!=(const Shape &other) const { return !((*this) == other); } + + // Required for use in maps/sets + bool operator<(const Shape &other) const + { + auto *from = other.Storage(); + auto *local = Storage(); + + for ( int i = std::min(other._last, _last); i >= 0; i-- ) + { + if ( local[i] < from[i] ) + { + return true; + } + else if ( local[i] > from[i] ) + { + return false; + } + } + + return false; + } + + bool operator<=(const Shape &other) const { return *this < other || *this == other; } + + Shape operator+(const Shape &other) const { return Shape::MaxFunc, false, 0>(*this, other); } + + Shape operator-(const Shape &other) const { return Shape::MaxFunc, false, 0>(*this, other); } + + Shape operator%(const Shape &other) const { return Shape::MaxFunc, false, 1>(*this, other); } + + Shape operator/(const Shape &other) const { return Shape::MaxFunc, false, 1>(*this, other); } + + Shape 
operator*(int scale) const { return Shape::ScalarFunc>(*this, scale); } + + Shape operator/(int scale) const { return Shape::ScalarFunc>(*this, scale); } + + Shape &operator+=(const Shape &other) + { + Shape tmp = *this + other; + *this = std::move(tmp); + return *this; + } + + Shape &operator-=(const Shape &other) + { + Shape tmp = *this - other; + *this = std::move(tmp); + return *this; + } + + int Dot(const Shape &other) const + { + int result = 0; + if ( VERIFY(other.Size() == Size()) ) + { + auto *from = other.Storage(); + auto *local = Storage(); + for ( int i = 0; i <= _last; i++ ) + { + result += local[i] * from[i]; + } + } + return result; + } + + Shape With(int index, int value) const + { + Shape tmp(*this, std::max(Size(), index + 1)); + tmp.At(ToOffset(index)) = value; + return tmp; + } + + Shape WithBatch(int n) const + { + Shape tmp(*this, std::max(Size(), 4)); + tmp.At(3) = n; + return tmp; + } + + Shape WithHeight(int h) const + { + Shape tmp(*this, std::max(Size(), 3)); + tmp.At(2) = h; + return tmp; + } + + Shape WithWidth(int w) const + { + Shape tmp(*this, std::max(Size(), 2)); + tmp.At(1) = w; + return tmp; + } + + Shape WithDepth(int d) const + { + Shape tmp(*this, std::max(Size(), 1)); + tmp.At(0) = d; + return tmp; + } + + Shape WithHW(int h, int w) const + { + Shape tmp(*this, std::max(Size(), 3)); + tmp.At(2) = h; + tmp.At(1) = w; + return tmp; + } + + Shape WithZeros() const { return Shape(nullptr, Size()); } + + Shape WithOnes() const + { + Shape tmp(nullptr, Size()); + std::fill_n(tmp.Storage(), Size(), 1); + return tmp; + } + + Shape Insert(int index, int value) const + { + Shape tmp(nullptr, Size() + 1); + auto *result = tmp.Storage(); + auto *local = Storage(); + + index = Size() - index; + for ( int i = 0; i < index; i++ ) + result[i] = local[i]; + result[index] = value; + for ( int i = index; i <= _last; i++ ) + result[i + 1] = local[i]; + + return tmp; + } + + Shape Erase(int index) const + { + Shape tmp(nullptr, Size() - 1); + 
auto *result = tmp.Storage(); + auto *local = Storage(); + + index = ToOffset(index); + for ( int i = 0; i < index; i++ ) + result[i] = local[i]; + for ( int i = index + 1; i <= _last; i++ ) + result[i - 1] = local[i]; + + return tmp; + } + + Shape Extract(int a, int b, int c) const { return Extract({a, b, c}); } + + Shape Extract(int a, int b, int c, int d) const { return Extract({a, b, c, d}); } + + Shape Extract(std::initializer_list axes) const + { + Shape tmp(nullptr, int(axes.size())); + auto *local = Storage(); + auto *result = tmp.Storage() + tmp.Size() - 1; + for ( auto axis : axes ) + { + int from = ToOffset(axis); + assert(from < Size()); + *result-- = local[from]; + } + return tmp; + } + + Shape Extract(const Shape &axes) const + { + Shape tmp(nullptr, axes.Size()); + auto *local = Storage(); + auto *result = tmp.Storage() + tmp.Size() - 1; + for ( int i = 0; i < axes.Size(); i++ ) + { + int from = ToOffset(axes[i]); + assert(from < Size()); + *result-- = local[from]; + } + return tmp; + } + + // Permute using 4-bit-per-axis mask with depth in the LSB + Shape Permute(uint32_t reverseAxisMask4b) const + { + int length = Size(); + if ( length == 0 ) return *this; + + Shape tmp(nullptr, length); + auto *local = Storage(); + auto *result = tmp.Storage(); + + while ( length-- ) + { + int from = ToOffset(reverseAxisMask4b & 0xF); + assert(from < Size()); + *result++ = local[from]; + reverseAxisMask4b = reverseAxisMask4b >> 4; + } + assert((tmp.Elements64() == Elements64()) && "Possible bad permute (volume differs)"); + return tmp; + } + + // Reverse permute using 4-bit-per-axis mask with depth in the LSB + Shape Unpermute(uint32_t reverseAxisMask4b) const + { + int length = Size(); + if ( length == 0 ) return *this; + + Shape tmp(nullptr, length); + auto *local = Storage(); + auto *result = tmp.Storage(); + + while ( length-- ) + { + int to = ToOffset(reverseAxisMask4b & 0xF); + assert(to < Size()); + result[to] = *local++; + reverseAxisMask4b = 
reverseAxisMask4b >> 4; + } + assert((tmp.Elements64() == Elements64()) && "Possible bad unpermute (volume differs)"); + return tmp; + } + + Shape Untranspose(TransposeType type) const + { + if ( IsNone(type) ) return *this; + + return Unpermute(uint32_t(type)); + } + + int Size() const { return _last + 1; } + + int Depth() const + { + assert(_last >= 0); + return At(0); + } + int Width() const + { + assert(_last >= 1); + return At(1); + } + int Height() const + { + assert(_last >= 2); + return At(2); + } + int Batch() const + { + assert(_last >= 3); + return At(3); + } + + template + Point2 WC() const + { + assert(Size() >= 2); + return Point2(TYPE(Width()), TYPE(Depth())); + } + + template + Point2 WH() const + { + assert(Size() >= 3); + return Point2(TYPE(Width()), TYPE(Height())); + } + + template + Point3 HWC() const + { + assert(Size() >= 3); + return Point3(TYPE(Height()), TYPE(Width()), TYPE(Depth())); + } + + int ElementsWH() const + { + int64_t result = int64_t(Width()) * Height(); + assert(result <= std::numeric_limits::max()); + return int(result); + } + + int Elements() const + { + int64_t result = Elements64(); + assert(result <= std::numeric_limits::max()); + return int(result); + } + + int64_t Elements64() const + { + int64_t result = 0; + if ( IsValid() ) + { + auto *local = Storage(); + result = local[0]; + for ( int i = 1; i <= _last; i++ ) + { + result *= local[i]; + } + } + return result; + } + + bool IsValid() const { return _last >= 0; } + + bool IsDynamic() const { return _dynamic; } + + bool IsEmpty() const + { + auto *local = Storage(); + for ( int i = 0; i <= _last; i++ ) + { + if ( local != 0 ) + { + return false; + } + } + return true; + } + + bool IsSubShapeOf(const Shape &other) const + { + if ( Size() > other.Size() ) + { + return false; + } + auto *bounds = other.Storage(); + auto *local = Storage(); + for ( int i = _last; i >= 0; i-- ) + { + if ( local[i] > bounds[i] ) + { + return false; + } + } + return true; + } + + template + 
int ToNHWC(TYPE *buffer, int length) const + { + auto *local = Storage() + _last; + for ( int i = 0; i < length; i++ ) + { + *buffer++ = TYPE(local[-i]); + } + return length; + } + + template + std::vector ToList() const + { + return std::vector(std::reverse_iterator(Storage() + Size()), std::reverse_iterator(Storage())); + } + + std::string ToString() const + { + std::string tmp; + tmp.reserve(16); + auto *local = Storage(); + for ( int i = _last; i >= 0; i-- ) + { + tmp += std::to_string(local[i]); + if ( i > 0 ) + { + tmp += ", "; + } + } + return tmp; + } + +private: + void Init(int size, int fillValue = 0) + { + assert(size > 0); + assert(size <= 127); + _last = size - 1; + _dynamic = (size > MAX_STATIC_AXES); + int32_t *p = _dynamic ? (_storage.ptr = new int32_t[size]) : _storage.axes; + std::fill_n(p, size, fillValue); + } + + void Free() + { + if ( _dynamic ) delete[] _storage.ptr; + _last = -1; // Becomes invalid + _dynamic = false; + _storage.ptr = nullptr; + } + + int32_t &At(int index) { return Storage()[index]; } + + const int32_t &At(int index) const { return Storage()[index]; } + + int32_t *Storage() { return _dynamic ? _storage.ptr : _storage.axes; } + + const int32_t *Storage() const { return _dynamic ? _storage.ptr : _storage.axes; } + + int ToOffset(int index) const { return (index < 0) ? (-index - 1) : (_last - index); } + + // Apply a function to the minimum number of axes between two shapes. + template + static Shape MinFunc(const Shape &a, const Shape &b) + { + int size = std::min(a.Size(), b.Size()); + Shape tmp(nullptr, size); + auto *pa = a.Storage(); + auto *pb = b.Storage(); + auto *result = tmp.Storage(); + for ( int i = 0; i < size; i++ ) + { + result[i] = FUNC()(pa[i], pb[i]); + } + return tmp; + } + + // Apply a function to the maximum number of axes between two shapes. For missing + // axes either take from the longest shape, or substitute a constant value. 
+ template + static Shape MaxFunc(const Shape &a, const Shape &b) + { + bool a_longer = a.Size() >= b.Size(); + int length = a_longer ? a.Size() : b.Size(); + int shortest = a_longer ? b.Size() : a.Size(); + + Shape tmp(nullptr, length); + auto *pa = a.Storage(); + auto *pb = b.Storage(); + auto *result = tmp.Storage(); + + int i = 0; + for ( ; i < shortest; i++ ) + { + result[i] = FUNC()(pa[i], pb[i]); + } + for ( ; i < length; i++ ) + { + if ( TAKE_LONGEST ) + { + result[i] = a_longer ? pa[i] : pb[i]; + } + else + { + result[i] = a_longer ? FUNC()(pa[i], MISSING_VALUE) : FUNC()(MISSING_VALUE, pb[i]); + } + } + return tmp; + } + + // Apply a scalar function to all axes + template + static Shape ScalarFunc(const Shape &a, int value) + { + Shape tmp(nullptr, a.Size()); + auto *pa = a.Storage(); + auto *result = tmp.Storage(); + for ( int i = 0; i < a.Size(); i++ ) + { + result[i] = FUNC()(pa[i], value); + } + return tmp; + } + + // Proxy for pointer-to-functions + template + struct func_proxy + { + std::remove_reference_t operator()(T a, T b) const { return FUNC(a, b); } + }; + + template + struct op_wrap + { + T operator()(const T a, const T b) const + { + assert(b > 0); + return (a >= b) ? (a % b) : a; + } + }; + +public: + template + static Shape FromVector(const std::vector &from) + { + return from.empty() ? 
Shape() : Shape(from.data(), from.size()); + } + + static Shape PadAxes(const Shape &shape, int axes, int padValue) + { + return Shape(shape, std::max(axes, shape.Size()), padValue); + } + + static Shape Min(const Shape &a, const Shape &b) + { + return Shape::MinFunc>>(a, b); + } + + static Shape Max(const Shape &a, const Shape &b) + { + return Shape::MaxFunc>, true>(a, b); + } + + static Shape RoundAway(const Shape &a, const Shape &b) + { + return Shape::MinFunc>>(a, b); + } + + static Shape RoundZero(const Shape &a, const Shape &b) + { + return Shape::MinFunc>>(a, b); + } + + static Shape DivRoundUp(const Shape &a, const Shape &b) + { + return Shape::MinFunc>>(a, b); + } + + static Shape Wrap(const Shape &a, const Shape &b) { return Shape::MinFunc>(a, b); } + + static Shape GetStridesForShape(const Shape &shape, const Shape &granularity) + { + assert(granularity.Size() >= shape.Size()); + Shape tmp(nullptr, shape.Size()); + if ( shape.IsValid() ) + { + auto *gran = granularity.Storage(); + auto *from = shape.Storage(); + auto *result = tmp.Storage(); + result[0] = gran[0]; + for ( int i = 1; i <= shape._last; i++ ) + { + result[i] = ::RoundAway(result[i - 1] * from[i - 1], gran[i]); + } + } + return tmp; + } +}; diff --git a/ethosu/regor/common/transpose_type.cpp b/ethosu/regor/common/transpose_type.cpp new file mode 100644 index 00000000..2d7671a0 --- /dev/null +++ b/ethosu/regor/common/transpose_type.cpp @@ -0,0 +1,33 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "common/transpose_type.hpp" + +#include "common/logging.hpp" + +#include "common/bit_flags.hpp" + +BEGIN_ENUM_TABLE(TransposeType) + ADD_ENUM_NAME(NHWC) + ADD_ENUM_NAME(NWHC) + ADD_ENUM_NAME(NHCW) + ADD_ENUM_NAME(NWCH) + ADD_ENUM_NAME(NCHW) + ADD_ENUM_NAME(NCWH) + ADD_ENUM_NAME(None) +END_ENUM_TABLE() diff --git a/ethosu/regor/common/transpose_type.hpp b/ethosu/regor/common/transpose_type.hpp new file mode 100644 index 00000000..58e4a3b1 --- /dev/null +++ b/ethosu/regor/common/transpose_type.hpp @@ -0,0 +1,78 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once +#include "common/bit_flags.hpp" + +enum class TransposeType : uint32_t +{ + NHWC = 0x0123, + NWHC = 0x0213, + NHCW = 0x0132, + NWCH = 0x0231, + NCHW = 0x0312, + NCWH = 0x0321, + None = 0x01234567, +}; + +inline constexpr TransposeType operator>>(TransposeType type, uint32_t size) +{ + return TransposeType(uint32_t(type) >> size); +} + +inline bool IsNone(TransposeType type) +{ + for ( int p = 0; p < 32; p += 4 ) + { + if ( type == (TransposeType::None >> p) ) return true; + } + return false; +} + +// Reduce a 4D transpose mask to a 3D transpose mask (f.ex. 0x0123 -> 0x012) +inline TransposeType Reduce4To3(TransposeType type) +{ + if ( IsNone(type) ) + { + return TransposeType(0x012); + } + + switch ( type ) + { + case TransposeType::NHWC: + case TransposeType::NWHC: + case TransposeType::NHCW: + case TransposeType::NWCH: + case TransposeType::NCHW: + case TransposeType::NCWH: + { + int n = uint32_t(type >> 12) & 0xF; + assert(n == 0); + int h = uint32_t(type >> 8) & 0xF; + assert(h <= 3); + int w = uint32_t(type >> 4) & 0xF; + assert(w <= 3); + int c = uint32_t(type >> 0) & 0xF; + assert(c <= 3); + return TransposeType(((h - 1) << 8) | ((w - 1) << 4) | (c - 1)); + } + default: + assert(false && "Unsupported transpose type"); + return type; + } +} diff --git a/ethosu/regor/common/vector_span.hpp b/ethosu/regor/common/vector_span.hpp new file mode 100644 index 00000000..9c63a5d5 --- /dev/null +++ b/ethosu/regor/common/vector_span.hpp @@ -0,0 +1,77 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include + +/// +/// Mechanism for treating partial sections of a vector as complete sequences +/// to allow sub-array processing. +/// Does not track invalidation (source vector cannot change) +/// +template +class vector_span +{ + using iterator = typename std::vector::iterator; + using const_iterator = typename std::vector::const_iterator; + +private: + iterator _start; + iterator _end; + +public: + vector_span() = default; + + vector_span(std::vector &vec) + { + _start = vec.begin(); + _end = vec.end(); + } + + vector_span(std::vector &vec, int start, int end) + { + _start = vec.begin() + start; + _end = vec.begin() + end; + } + + vector_span(const std::vector &vec, int start, int end) + { + auto posStart = vec.begin() + start; + auto posEnd = vec.begin() + end; + // Use vec.erase to convert from const to non-const iterators (erases nothing!) 
+ _start = const_cast &>(vec).erase(posStart, posStart); + _end = const_cast &>(vec).erase(posEnd, posEnd); + } + + TYPE &front() { return *_start; } + const TYPE &front() const { return *_start; } + + TYPE &back() { return *(_end - 1); } + const TYPE &back() const { return *(_end - 1); } + + // Iterate just the values + iterator begin() { return _start; } + iterator end() { return _end; } + const_iterator begin() const { return _start; } + const_iterator end() const { return _end; } + + int size() const { return int(std::distance(_start, _end)); } + + TYPE &operator[](int index) { return *(_start + index); } +}; diff --git a/ethosu/regor/compiler/attributes.cpp b/ethosu/regor/compiler/attributes.cpp new file mode 100644 index 00000000..009a01be --- /dev/null +++ b/ethosu/regor/compiler/attributes.cpp @@ -0,0 +1,66 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "attributes.hpp" + +#include "common/logging.hpp" + +namespace regor +{ + +#define REDUCED_HASH(hash) (hash & 0x000FFFFF) +#define CASE_MAKE_ATTR_INSTANCE(TYPE_) \ + case REDUCED_HASH(TypeHash::HASH): \ + return DynamicRef(TypeInfoOf::Get(true), TypeInfoOf::SharedNew()); + +DynamicRef CreateAttribute(uint32_t reducedHash) +{ + reducedHash = REDUCED_HASH(reducedHash); + switch ( reducedHash ) + { + CASE_MAKE_ATTR_INSTANCE(asr_attr_t); + CASE_MAKE_ATTR_INSTANCE(axis_attr_t); + CASE_MAKE_ATTR_INSTANCE(reshape_attr_t); + CASE_MAKE_ATTR_INSTANCE(clamp_attr_t); + CASE_MAKE_ATTR_INSTANCE(concat_attr_t); + CASE_MAKE_ATTR_INSTANCE(cond_attr_t); + CASE_MAKE_ATTR_INSTANCE(custom_attr_t); + CASE_MAKE_ATTR_INSTANCE(fft_attr_t); + CASE_MAKE_ATTR_INSTANCE(leaky_relu_attr_t); + CASE_MAKE_ATTR_INSTANCE(mul_attr_t); + CASE_MAKE_ATTR_INSTANCE(pack_unpack_attr_t); + CASE_MAKE_ATTR_INSTANCE(pooling_attr_t); + CASE_MAKE_ATTR_INSTANCE(rescale_attr_t); + CASE_MAKE_ATTR_INSTANCE(resize_attr_t); + CASE_MAKE_ATTR_INSTANCE(slice_attr_t); + CASE_MAKE_ATTR_INSTANCE(softmax_attr_t); + CASE_MAKE_ATTR_INSTANCE(strided_slice_attr_t); + CASE_MAKE_ATTR_INSTANCE(tile_attr_t); + CASE_MAKE_ATTR_INSTANCE(transpose_attr_t); + CASE_MAKE_ATTR_INSTANCE(transpose_conv2d_attr_t); + CASE_MAKE_ATTR_INSTANCE(while_attr_t); + default: + assert(false && "No attribute has this reduced hash"); + // Add a new XXX_attr_t struct to the header then + // insert a new case entry in the statement above + break; + }; + return {}; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/attributes.hpp b/ethosu/regor/compiler/attributes.hpp new file mode 100644 index 00000000..c7b4a897 --- /dev/null +++ b/ethosu/regor/compiler/attributes.hpp @@ -0,0 +1,259 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with 
the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "common/data_type.hpp" +#include "common/dynamic_typing.hpp" +#include "common/numeric_util.hpp" +#include "common/shape.hpp" +#include "include/graphapi.hpp" + +#include +#include + +namespace regor +{ + +#define GRAPHAPI_FUNCTION_DETAIL_ONLY +#include "include/graphapi_attr.hpp" + +// MakeAttributeId places the type information in the lower 4-bits for the specific +// purpose of stripping it with a right shift. This allows us to compare fields +// of a different type ids. +#define ATTR_FIELD_ID(CLASS_, UNIQUE_) detail::MakeAttributeId(TypeHash::HASH, 0, UNIQUE_) +#define ATTR_FIELD(MEMBER_, UNIQUE_) \ + { \ + offsetof(thisclass_t, MEMBER_), ATTR_FIELD_ID(thisclass_t, UNIQUE_) >> 4, \ + REGOR_FIELD_TYPE(decltype(reinterpret_cast(0)->MEMBER_)) \ + } \ + , + +struct pooling_attr_t +{ + DataType accPrecision; + BEGIN_FIELD_TABLE(pooling_attr_t) + ATTR_FIELD(accPrecision, 0) + END_FIELD_TABLE() +}; + +struct axis_attr_t +{ + int32_t axis; + BEGIN_FIELD_TABLE(axis_attr_t) + ATTR_FIELD(axis, 0) + END_FIELD_TABLE() +}; + +struct reshape_attr_t +{ + Shape shape; + BEGIN_FIELD_TABLE(reshape_attr_t) + ATTR_FIELD(shape, 0) + END_FIELD_TABLE() +}; + +struct slice_attr_t +{ + Shape begin; + Shape size; + BEGIN_FIELD_TABLE(slice_attr_t) + ATTR_FIELD(begin, 0) + ATTR_FIELD(size, 1) + END_FIELD_TABLE() +}; + +struct resize_attr_t +{ + Point2i scaleX; // X/Y (x=numerator/y=denominator) + Point2i scaleY; // X/Y (x=numerator/y=denominator) + Point2i offset; + Point2i border; + tosa::ResizeMode mode; 
+ BEGIN_FIELD_TABLE(resize_attr_t) + ATTR_FIELD(scaleX, 0) + ATTR_FIELD(scaleY, 1) + ATTR_FIELD(offset, 2) + ATTR_FIELD(border, 3) + ATTR_FIELD(mode, 4) + END_FIELD_TABLE() +}; + +struct clamp_attr_t +{ + double min; + double max; + BEGIN_FIELD_TABLE(clamp_attr_t) + ATTR_FIELD(min, 0) + ATTR_FIELD(max, 1) + END_FIELD_TABLE() +}; + +struct rescale_attr_t +{ + bool scale32; + bool double_round; + bool per_channel; + BEGIN_FIELD_TABLE(rescale_attr_t) + ATTR_FIELD(scale32, 0) + ATTR_FIELD(double_round, 1) + ATTR_FIELD(per_channel, 2) + END_FIELD_TABLE() +}; + +struct mul_attr_t +{ + int32_t shift; + BEGIN_FIELD_TABLE(mul_attr_t) + ATTR_FIELD(shift, 0) + END_FIELD_TABLE() +}; + +struct asr_attr_t +{ + bool round; + BEGIN_FIELD_TABLE(asr_attr_t) + ATTR_FIELD(round, 0) + END_FIELD_TABLE() +}; + +struct cond_attr_t +{ + std::string then_branch; + std::string else_branch; + BEGIN_FIELD_TABLE(cond_attr_t) + ATTR_FIELD(then_branch, 0) + ATTR_FIELD(else_branch, 1) + END_FIELD_TABLE() +}; + +struct while_attr_t +{ + std::string cond_branch; + std::string body_branch; + BEGIN_FIELD_TABLE(while_attr_t) + ATTR_FIELD(cond_branch, 0) + ATTR_FIELD(body_branch, 1) + END_FIELD_TABLE() +}; + +struct transpose_conv2d_attr_t +{ + Shape outShape; + BEGIN_FIELD_TABLE(transpose_conv2d_attr_t) + ATTR_FIELD(outShape, 0) + END_FIELD_TABLE() +}; + +struct transpose_attr_t +{ + Shape perm; + BEGIN_FIELD_TABLE(transpose_attr_t) + ATTR_FIELD(perm, 0) + END_FIELD_TABLE() +}; + +struct tile_attr_t +{ + Shape multiples; + BEGIN_FIELD_TABLE(tile_attr_t) + ATTR_FIELD(multiples, 0) + END_FIELD_TABLE() +}; + +struct fft_attr_t +{ + bool inverse; + BEGIN_FIELD_TABLE(fft_attr_t) + ATTR_FIELD(inverse, 0) + END_FIELD_TABLE() +}; + +struct custom_attr_t +{ + std::string name; + std::string domain; + BEGIN_FIELD_TABLE(custom_attr_t) + ATTR_FIELD(name, 0) + ATTR_FIELD(domain, 1) + END_FIELD_TABLE() +}; + +struct strided_slice_attr_t +{ + int begin_mask; + int end_mask; + int ellipsis_mask; + int new_axis_mask; + 
int shrink_axis_mask; + BEGIN_FIELD_TABLE(strided_slice_attr_t) + ATTR_FIELD(begin_mask, 0) + ATTR_FIELD(end_mask, 1) + ATTR_FIELD(ellipsis_mask, 2) + ATTR_FIELD(new_axis_mask, 3) + ATTR_FIELD(shrink_axis_mask, 4) + END_FIELD_TABLE() +}; + +struct tflite_resize_t +{ + bool alignCorners; + bool halfPixelCenters; + BEGIN_FIELD_TABLE(tflite_resize_t) + ATTR_FIELD(alignCorners, 0) + ATTR_FIELD(halfPixelCenters, 1) + END_FIELD_TABLE() +}; + +struct leaky_relu_attr_t +{ + float alpha; + BEGIN_FIELD_TABLE(leaky_relu_attr_t) + ATTR_FIELD(alpha, 0) + END_FIELD_TABLE() +}; + +struct softmax_attr_t +{ + float beta; + BEGIN_FIELD_TABLE(softmax_attr_t) + ATTR_FIELD(beta, 0) + END_FIELD_TABLE() +}; + +struct concat_attr_t +{ + int axis; + BEGIN_FIELD_TABLE(concat_attr_t) + ATTR_FIELD(axis, 0) + END_FIELD_TABLE() +}; + +struct pack_unpack_attr_t +{ + int axis; + BEGIN_FIELD_TABLE(pack_unpack_attr_t) + ATTR_FIELD(axis, 0) + END_FIELD_TABLE() +}; + +DynamicRef CreateAttribute(uint32_t reducedhash); + +} // namespace regor diff --git a/ethosu/regor/compiler/cascade_builder.cpp b/ethosu/regor/compiler/cascade_builder.cpp new file mode 100644 index 00000000..1d185f22 --- /dev/null +++ b/ethosu/regor/compiler/cascade_builder.cpp @@ -0,0 +1,399 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +// #define LOG_TRACE_ENABLE TD_1 +#include "cascade_builder.hpp" + +#include "common/logging.hpp" + +#include "common/numeric_util.hpp" +#include "common/shape.hpp" +#include "op_type.hpp" +#include "scheduler.hpp" +#include "scheduler_operation.hpp" + +#include +#include +#include + +namespace regor +{ + +class BufferMap +{ + using Key = std::pair; + struct KeyHash + { + size_t operator()(const Key &k) const { return (k.first << 8) ^ k.second; } + }; + +private: + std::unordered_map _cache; + +public: + CascadeBuffer GetBuffer(SchedulerOperation *producer, SchedulerOperation *consumer, const Schedule *refSchedule) + { + auto key = Key(producer ? *producer : 0, consumer ? *consumer : 0); + auto pos = _cache.find(key); + if ( pos != _cache.end() ) + { + return pos->second; + } + + Shape bufferShape; + int bufferSize = 0; + // No cached buffer between these two SchedulerOperations + if ( consumer == nullptr ) + { + auto ofm = producer->OFM(); + // There are either no consumers or multiple consumers - FeatureMap needs to be stored in full + bufferShape = ofm->shape; + bufferSize = ofm->tensor->AllocationSizeBytes(); + } + else if ( producer == nullptr ) + { + auto ifm = consumer->IFM(consumer->PrimaryIfmIndex()); + // First Op in subgraph or cascade - FeatureMap needs to be stored in full + bufferShape = ifm->shape; + bufferSize = ifm->tensor->AllocationSizeBytes(); + } + else + { + auto ofm = producer->OFM(); + auto ifm = consumer->IFM(consumer->PrimaryIfmIndex()); + + if ( ofm->requireFullTensor || ifm->requireFullTensor ) + { + // FeatureMap needs to be stored in full + bufferShape = Shape::Max(ofm->shape, ifm->shape); + bufferSize = std::max(ofm->tensor->AllocationSizeBytes(), ifm->tensor->AllocationSizeBytes()); + } + else + { + // Use a rolling buffer + auto producerCost = refSchedule->Cost(producer); + auto consumerCost = refSchedule->Cost(consumer); + + bufferShape = RollingBufferShape(producerCost->stripe, consumerCost->stripeInput[0]); + bufferSize = 
DataTypeStorageSizeBytes(ofm->tensor->dataType, bufferShape.Elements()); + } + } + _cache.emplace(key, CascadeBuffer(bufferShape, bufferSize)); + + return CascadeBuffer(bufferShape, bufferSize); + } + + Shape RollingBufferShape(const Shape &producerStripeShape, const Shape &consumerStripeShape) + { + // Calculates the storage shape of the rolling buffer between two SchedulerOperations in a Cascade + int buffer_height = RoundAway(producerStripeShape.Height() + consumerStripeShape.Height(), consumerStripeShape.Height()); + // Rolling buffers have to conform to NHCWB16 alignment + return consumerStripeShape.With(-3, buffer_height).With(-1, RoundAway(producerStripeShape.Depth(), 16)); + } +}; + + +CascadeBuilder::CascadeBuilder(vector_span> ops, const std::unordered_map &nonLocalMemUsage, bool spilling) : + _ops(ops), _nonLocalMemUsage(nonLocalMemUsage) + +{ + _spilling = spilling; +} + + +void CascadeBuilder::BuildCascades(Schedule *refSchedule, Schedule *fallbackSchedule, Address guidingStagingLimit) +{ + BufferMap buffers; + SchedulerCostMap costs; + std::unordered_map cascadeMap; + + + LOG_TRACE1("Build Cascades for '{}' with limit of {} bytes\n", refSchedule->Name(), guidingStagingLimit); + // Peak memory usage so far - updated continuously, except where spilling makes this a hard limit + int peakStagingUsage = int(std::min(INT64_C(1) << 30, guidingStagingLimit)); + auto pos = _ops.begin(); + while ( pos != _ops.end() ) + { + SchedulerOperation *op = pos->get(); + if ( !op->IsNpuOp() ) + { + pos++; + continue; + } + + // Already processed this Op if it has a cost + if ( costs.find(*op) != costs.end() ) + { + pos++; + continue; + } + + auto fallbackCost = fallbackSchedule->Cost(op); + + SchedulerConnection *ifm = op->IFM(op->PrimaryIfmIndex()); + + // If Op is not a candidate for cascading - assign fallback cost + if ( !IsCascadable(op, ifm->tensor.get(), refSchedule->Cost(op)) ) + { + costs[*op] = std::make_unique(*fallbackCost); + if ( !_spilling ) + { + 
peakStagingUsage = std::max(EstimateBufferUsage(op, fallbackCost), peakStagingUsage); + } + pos++; + continue; + } + + // Propose a cascade starting with this Op + // Keep track of which Ops are in the proposed cascade as well as the best cascade so far + int cascadeStart = op->Index(); + std::vector opsInCascade = {op}; + std::vector opsInBestCascade = {op}; + + // Get the size of the weight buffer + int weightBufferSize = 0; + auto refCost = refSchedule->Cost(op); + if ( refCost->bufferedWeightTensor.tensor ) + { + weightBufferSize = refCost->bufferedWeightTensor.tensor->AllocationSizeBytes(); + } + + // The first IFM needs to be stored in full + int cascadeIFMSize = _spilling ? 0 : ifm->tensor->AllocationSizeBytes(); + + // Add non-local memory usage + cascadeIFMSize += NonLocalUsage(*op); + + // Sum of all intermediate cascade buffers (including weight buffers) + int cascadeBuffersSize = weightBufferSize; + + // Best cascade size - Initially it's the fallback cost of the first Op in the cascade + int bestCascadeSize = EstimateBufferUsage(op, fallbackCost); + + // Op is the producer of the OFM consumed by the next Op to consider + auto producer = op; + while ( true ) + { + auto &dependants = producer->OFM()->tensor->consumers; + + if ( dependants.size() != 1u ) + { + // producer is either the last Op in the schedule or the start of a branch + break; + } + + SchedulerOperation *currentOp = dependants[0]; + refCost = refSchedule->Cost(currentOp); + + auto currentIfm = currentOp->IFM(currentOp->PrimaryIfmIndex()); + + if ( costs.find(*currentOp) != costs.end() || (refCost == nullptr) || + !IsCascadable(currentOp, currentIfm->tensor.get(), refCost) || + producer->OFM()->shape != currentIfm->shape || currentIfm->requireFullTensor ) + { + // Current op has already been processed or cannot be cascaded + break; + } + if ( currentOp->Index() != producer->Index() + 1 ) + { + // Cascading is possible, but requires reordering of operations in the schedule, + // this is 
currently not supported + break; + } + + // Get the size of the FeatureMap buffers between current and neighbouring Ops + int opFullIfmSize = currentIfm->tensor->AllocationSizeBytes(); + int opFullOfmSize = currentOp->OFM()->tensor->AllocationSizeBytes(); + + auto bufferInfo = buffers.GetBuffer(producer, currentOp, refSchedule); + int ifmBufferSize = bufferInfo.sizeBytes; + + // Get the size of the weight buffer + int opWeightBuffer = 0; + if ( refCost->bufferedWeightTensor.tensor ) + { + opWeightBuffer = refCost->bufferedWeightTensor.tensor->AllocationSizeBytes(); + } + + // Calculate the uncascaded memory requirement for current Op + int uncascadedStagingUsage = opFullIfmSize + opFullOfmSize + NonLocalUsage(*currentOp); + + // Add current Op to cascade + opsInCascade.push_back(currentOp); + + // Increase the accumulated intermediate buffers in the cascade + cascadeBuffersSize += ifmBufferSize + opWeightBuffer; + + LOG_TRACE1("\tAppend '{0}:{1}' to cascade\n", currentOp->Index(), OpTypeToString(currentOp->Type())); + LOG_TRACE1("\t\tFull Primary IFM [{0}] bytes = {1}, Full OFM bytes [{2}] = {3}\n", + currentIfm->shape.ToString(), opFullIfmSize, currentOp->OFM()->shape.ToString(), opFullOfmSize); + LOG_TRACE1("\t\tCascade buffer bytes = {0} - [{1}]\n", cascadeBuffersSize, bufferInfo.shape.ToString()); + + if ( _spilling ) + { + if ( (uncascadedStagingUsage < peakStagingUsage) || (cascadeBuffersSize > peakStagingUsage) ) + { + // Cascade until an Op fits in its entirety or the accumulated buffers no longer fit + break; + } + else + { + opsInBestCascade = opsInCascade; + bestCascadeSize = cascadeBuffersSize; + } + } + else + { + // Calculate the total size of the current cascade + int cascadeSize = cascadeIFMSize + cascadeBuffersSize + opFullOfmSize; + + // Determine if current cascade is the best so far + if ( cascadeSize < bestCascadeSize ) + { + bestCascadeSize = cascadeSize; + opsInBestCascade = opsInCascade; + } + // Determine if cascading search should stop + 
if ( ((uncascadedStagingUsage < peakStagingUsage) && (bestCascadeSize < peakStagingUsage)) || + (cascadeIFMSize + cascadeBuffersSize) > bestCascadeSize ) + { + // Both the existing cascade and current Op fits + break; + } + } + + producer = currentOp; + } + + if ( opsInBestCascade.size() > 1 ) + { + // A cascade was created - assign cascade and ref_cost to all of the Ops + int cascadeEnd = cascadeStart + int(opsInBestCascade.size()) - 1; // Inclusive end + + std::unordered_map buffersInCascade; + SchedulerOperation *prevOp = nullptr; + for ( auto cascadedOp : opsInBestCascade ) + { + assert(cascadedOp->Index() <= cascadeEnd); + auto cascadedCost = std::make_unique(*refSchedule->Cost(cascadedOp)); + cascadedCost->cascade = cascadeEnd; + costs.emplace(*cascadedOp, std::move(cascadedCost)); + + if ( prevOp ) + { + auto const &buffer = buffers.GetBuffer(prevOp, cascadedOp, refSchedule); + buffersInCascade[*cascadedOp] = buffer; + } + + prevOp = cascadedOp; + } + + // Create a CascadeInfo for the cascade + cascadeMap.emplace(cascadeEnd, CascadeInfo(cascadeStart, cascadeEnd, bestCascadeSize, std::move(buffersInCascade))); + if ( !_spilling ) + { + // Update peak memory usage + peakStagingUsage = std::max(bestCascadeSize, peakStagingUsage); + } + } + else + { + // Assign fallback cost to the initial Op + costs.emplace(*op, std::make_unique(*fallbackCost)); + if ( !_spilling ) + { + peakStagingUsage = std::max(EstimateBufferUsage(op, fallbackCost), peakStagingUsage); + } + } + } + // Update costing and cascade information for the ref_schedule + refSchedule->UpdateCosts(costs); + refSchedule->cascades = std::move(cascadeMap); +} + + +bool CascadeBuilder::IsCascadable(const SchedulerOperation *op, SchedulerTensor *ifm, SchedulerOpInfo *cost) const +{ + OpType type = op->Type(); + + if ( ifm->srcTensor->IsConstant() ) + { + return false; + } + + // Cascadable operations currently need to support NHCWB16 + if ( ifm->needsLinearFormat ) + { + return false; + } + + if ( 
IsDma(op->Type()) ) + { + return false; + } + + if ( op->IsReordering() ) + { + LOG_TRACE1("Not cascading Transpose/Reverse"); + return false; + } + + return (cost->stripe.Height() < op->OFM()->shape.Height()) && (IsConvolution(type) || IsElementwise(type) || IsPooling(type)); +} + + +int CascadeBuilder::EstimateBufferUsage(SchedulerOperation *op, SchedulerOpInfo *) const +{ + // Estimate the RAM required for the Op if all FeatureMaps are in RAM + int size = NonLocalUsage(*op); + + for ( auto usage : {TensorUsage::IFM, TensorUsage::IFM1, TensorUsage::OFM} ) + { + SchedulerConnection *fm = IsOFM(usage) ? op->Output(usage) : op->TryInput(usage); + if ( !fm ) + { + continue; + } + + if ( fm->requireFullTensor ) + { + size += fm->tensor->AllocationSizeBytes(); + } + else + { + size += fm->PartialAllocationSizeBytes(); + size = RoundAway(size, 16); + } + } + + return size; +} + + +int CascadeBuilder::NonLocalUsage(UniqueId uid) const +{ + auto opPos = _nonLocalMemUsage.find(uid); + if ( opPos != _nonLocalMemUsage.end() ) + { + return opPos->second; + } + + return 0; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/cascade_builder.hpp b/ethosu/regor/compiler/cascade_builder.hpp new file mode 100644 index 00000000..df70ca4c --- /dev/null +++ b/ethosu/regor/compiler/cascade_builder.hpp @@ -0,0 +1,99 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "common/shape.hpp" +#include "common/vector_span.hpp" +#include "op_type.hpp" +#include "scheduler_operation.hpp" + +#include +#include +#include + +namespace regor +{ + +class Schedule; + +/// +/// Information about a cascade buffer +/// +struct CascadeBuffer +{ + Shape shape; + int sizeBytes = 0; + CascadeBuffer() = default; + CascadeBuffer(const CascadeBuffer &) = default; + CascadeBuffer(const Shape &s, int size) : shape(s), sizeBytes(size) {} + CascadeBuffer &operator=(const CascadeBuffer &) = default; +}; + + +/// +/// Information about a cascade within a schedule +/// +struct CascadeInfo +{ + int start = 0; + int end = 0; + int memUsage = 0; + std::unordered_map buffers; + + CascadeInfo() = default; + CascadeInfo(const CascadeInfo &) = default; + CascadeInfo(int start_, int end_, int memUsage_, std::unordered_map buffers_) + { + this->start = start_; + this->end = end_; + this->memUsage = memUsage_; + this->buffers = std::move(buffers_); + } + CascadeInfo &operator=(const CascadeInfo &) = default; +}; + + +class SchedulerOpInfo; + +/// +/// Cascade builder for lists of scheduler operations +/// +class CascadeBuilder +{ +private: + vector_span> _ops; + const std::unordered_map &_nonLocalMemUsage; + bool _spilling = false; + +public: + CascadeBuilder(vector_span> ops, + const std::unordered_map &nonLocalMemUsage, bool spilling); + +public: + void BuildCascades(Schedule *refSchedule, Schedule *fallbackSchedule, Address guidingStagingLimit); + +private: + bool IsCascadable(const SchedulerOperation *op, SchedulerTensor *ifm, SchedulerOpInfo *cost) const; + int EstimateBufferUsage(SchedulerOperation *op, SchedulerOpInfo *cost) const; + int NonLocalUsage(UniqueId uid) const; +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/compiler.cpp b/ethosu/regor/compiler/compiler.cpp new file 
mode 100644 index 00000000..a9c4237c --- /dev/null +++ b/ethosu/regor/compiler/compiler.cpp @@ -0,0 +1,455 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "compiler.hpp" + +#include "common/logging.hpp" + +#include "architecture/register_command_stream_generator.hpp" +#include "common/bit_flags.hpp" +#include "common/ini_reader.hpp" +#include "graph_optimiser.hpp" +#include "graph_packing.hpp" +#include "graph_validator.hpp" +#include "high_level_command_stream_generator.hpp" +#include "network_performance.hpp" +#include "raw_writer.hpp" +#include "scheduler_packing.hpp" +#include "tensor_allocator.hpp" +#include "tflite/custom_operator_ethosu.hpp" +#include "tflite/tflite_reader.hpp" +#include "tflite/tflite_writer.hpp" +#include "tosa/tosa_reader.hpp" + +BEGIN_ENUM_TABLE(regor::OutputFormat) + ADD_ENUM_NAME(None) + ADD_ENUM_NAME(TFLite) + ADD_ENUM_NAME(Raw) +END_ENUM_TABLE() + +namespace regor +{ + +Compiler::Compiler(std::unique_ptr &arch) +{ + _architecture = std::move(arch); +} + +Compiler::~Compiler() +{ + for ( auto blob : _output ) + { + blob->Release(); + } +} + +bool Compiler::ParseConfig(const char *text, size_t size) +{ + // Get architecture configuration + IniReader reader(text, size); + + std::string section; + while ( reader.Begin(section) ) + { + auto result = _architecture->ParseSection(section, &reader); 
+ if ( result == IniParseResult::Error ) + { + SetLastError(fmt::format("Error parsing [{}]", section)); + return false; + } + reader.End(); + } + + return true; +} + + +bool Compiler::ParseOptions(const char *text, size_t size) +{ + Logging::Out.SetFilterMask(1 | 2 | 4); + + // Get compiler info + IniReader reader(text, size); + + std::string section; + while ( reader.Begin(section) ) + { + if ( section == "debug" ) + { + // Parse debug settings + std::string key; + while ( reader.Begin(key) ) + { + if ( key == "trace" ) + { + if ( reader.Get() ) + { + Logging::Out.SetFilterMask(Logging::Out.FilterMask() | 8 | 16 | 32); + } + } + reader.End(); + } + } + else if ( section == "compiler" ) + { + // Parse compiler options + std::string key; + while ( reader.Begin(key) ) + { + if ( key == "verbose_high_level_command_stream" ) + { + _compilerOptions.verboseHighLevelCommandStream = reader.Get(); + } + else if ( key == "verbose_register_command_stream" ) + { + _compilerOptions.verboseRegisterCommandStream = reader.Get(); + } + else if ( key == "enable_db" ) + { + _compilerOptions.debugDatabase = reader.Get(); + } + else if ( key == "perf_report" ) + { + _compilerOptions.perfReport = reader.Get(); + } + else if ( key == "output_format" ) + { + Flags flags; + flags.Parse(reader.Get()); + _compilerOptions.outputFormat = flags; + } + reader.End(); + } + } + else if ( section == "scheduler" ) + { + ParseSchedulerOptions(_schedulerOptions, reader); + } + else if ( section == "graph" ) + { + GraphOptimiser::ParseGraphOptimiserOptions(_graphOptimiserOptions, reader); + } + else + { + LOG_WARN("Skipping parsing of unrecognised options section '{}'\n", section); + } + + reader.End(); + } + + return true; +} + + +bool Compiler::LoadTosa(const void *input, size_t size) +{ + TosaReader::LoadGraphs(input, size, _builders); + return !_builders.empty(); +} + + +bool Compiler::LoadTflite(const void *input, size_t size) +{ + assert(input && size > 0); + + // Instantiate debug database if 
required early for TFLite + if ( _compilerOptions.debugDatabase != !!_optDb ) + _optDb = _compilerOptions.debugDatabase ? std::make_unique(&_Db) : nullptr; + + TfLiteReader::LoadGraphs(input, size, _graphs, _optDb.get()); + return !_graphs.empty(); +} + + +// Flatbuffer output blob +class RawBlob : public IRegorBlob +{ +private: + int _refCount = 1; + std::unique_ptr _buffer; + int64_t _offset; + int64_t _size; + +public: + RawBlob(std::unique_ptr buffer, int64_t offset, int64_t size) : + _buffer(std::move(buffer)), _offset(offset), _size(size) + { + } + + void AddRef() { _refCount++; } + + void Release() + { + if ( --_refCount == 0 ) + { + delete this; + } + } + + void *Map(int64_t &size) + { + size = _size; + return const_cast(_buffer.get() + _offset); + } + + void Unmap(void *) {} +}; + + +bool Compiler::Store(const std::vector> &graphs, + const std::vector> &tensorAddressMaps) +{ + if ( _compilerOptions.outputFormat == OutputFormat::Raw ) + { + RawWriter writer; + + // This will serialise multiple blobs + auto buffers = writer.Serialise(graphs, tensorAddressMaps); + + for ( auto &[buffer, bufferSize] : buffers ) + { + RawBlob *output = new RawBlob(std::move(buffer), 0, bufferSize); + _output.push_back(output); + } + } + else + { + TfLiteWriter writer; + int64_t offset; + size_t size; + + // This will only serialise one TFLite model + auto buffer = writer.Serialise(graphs, tensorAddressMaps, offset, size); + + RawBlob *output = new RawBlob(std::move(buffer), offset, int64_t(size)); + _output.push_back(output); + } + + return true; +} + + +bool Compiler::Compile() +{ + // Check that the configuration is okay to start compiling + std::string error; + if ( !_architecture->CheckConfiguration(error) ) + { + SetLastError(error); + return false; + } + + // If no graphs defined (nothing already loaded) then create a network from the graph builders + if ( _graphs.empty() ) + { + if ( _builders.empty() ) + { + SetLastError("No networks defined via GraphAPI"); + return 
false; + } + + // Instantiate debug database if required for GraphAPI + if ( _compilerOptions.debugDatabase != !!_optDb ) + _optDb = _compilerOptions.debugDatabase ? std::make_unique(&_Db) : nullptr; + + if ( !BuildNetwork(nullptr) ) // BuildNetworks sets error text + { + return false; + } + } + + // Is used to allocate all constant Npu tensors, in permanent storage + IncrementalLinearAllocator readOnlyAllocator("read-only NPU tensors"); + + // Compile each graph/subgraph separately + std::vector> newGraphs; + std::vector> tensorAddressMaps; + for ( auto &graph : _graphs ) + { + std::unordered_map tensorAddressMap; + auto newGraph = CompileGraph(graph, readOnlyAllocator, tensorAddressMap); + if ( !newGraph ) + { + return false; + } + newGraphs.push_back(std::move(newGraph)); + tensorAddressMaps.push_back(std::move(tensorAddressMap)); + } + + _optDb.reset(); + Store(newGraphs, tensorAddressMaps); + + _builders.clear(); + return true; +} + + +bool Compiler::BuildNetwork(const char *entryGraph) +{ + // Iterate through the builders committing their inputs/outputs to new + // graph objects. Any un-attached data will be dropped later. + for ( auto &builder : _builders ) + { + auto graph = std::make_unique( + builder.Name(), builder._inputs, builder._outputs, GraphNotation::GraphAPI, builder.SyntaxVersion()); + if ( entryGraph && (builder.Name() == entryGraph) ) + { + assert(!_entryPoint && "Entrypoint already set"); + _entryPoint = graph.get(); + } + _graphs.push_back(std::move(graph)); + } + + if ( _graphs.empty() ) + { + SetLastError("No graphs defined in network"); + return false; + } + + // Select first graph as entrypoint if none selected + if ( !_entryPoint ) + { + _entryPoint = _graphs.front().get(); + } + + // Clearing the builders will release anything that the client allocated + // but didn't use. Unconnected operators will get freed. 
+ _builders.clear(); + return true; +} + +std::unique_ptr Compiler::CompileGraph(std::unique_ptr &graph, + IncrementalLinearAllocator &readOnlyAllocator, std::unordered_map &tensorAddressMap) +{ + // Validate the input graph semantics + if ( graph->Notation() == GraphNotation::GraphAPI ) + { + auto validator = GraphValidator::MakeGraphValidator(graph->Notation(), graph->SyntaxVersion(), this); + if ( validator == nullptr ) + { + LOG_WARN("Input graph {0} not validated (required for GraphAPI) syntax={1:X}\n", graph->Name(), graph->SyntaxVersion()); + return nullptr; + } + if ( !validator->Validate(graph.get()) ) + { + SetLastError(validator->GetErrorMsg()); + return nullptr; + } + } + + // Preprocess/optimise the graph + std::unique_ptr optimiser = GraphOptimiser::MakeGraphOptimiser( + graph->Notation(), _architecture.get(), _graphOptimiserOptions, _optDb.get()); + if ( optimiser ) + { + optimiser->Process(graph.get()); + } + + // Pack/linearise graph Operations into SchedulerOperations + SchedulerPacking packing(_architecture.get()); + auto scheduleOps = packing.Process(graph.get()); + + // Schedule the linearised operation sequence + Scheduler scheduler(_architecture.get(), _schedulerOptions, "graph", scheduleOps); + auto schedule = scheduler.Process(); + + scheduler.AllocateReadOnlyAddresses(schedule.get(), readOnlyAllocator); + + // Calculate full network performance + if ( _compilerOptions.perfReport ) + { + NetworkPerformance perf(_architecture.get(), scheduleOps); + _perfResult = perf.Measure(schedule.get(), _optDb.get()); + } + + // Get a new graph and NPU operations from the scheduled operations + std::vector>> npuOps; + std::unique_ptr newGraph = PackScheduleToGraph(npuOps, scheduleOps, tensorAddressMap, graph.get()); + +#ifndef NDEBUG + // Validate the output graph is NPU-only + Graph::TraverseGraphFromEnd(newGraph->Outputs(), + [&](Operation *op) -> bool + { + assert((op->Type() == OpType::CustomNpuOp) || (graph->Notation() != GraphNotation::GraphAPI)); 
+ return true; + }); +#endif + + auto customOperatorBuilder = CustomOperatorBuilder(_architecture.get(), schedule.get()); + customOperatorBuilder.AllocateScratchTensors(tensorAddressMap); + + // Work over the NPU ops, generating code + for ( const auto &pair : npuOps ) + { + auto *graphOp = pair.first; + const auto *npuOp = pair.second.get(); + + // Generate HLCS + auto hlcsGenerator = HLCStreamGenerator(); + auto highLevelCommandStream = hlcsGenerator.GenerateCommandStream(npuOp, schedule.get(), _compilerOptions.verboseHighLevelCommandStream); + + // Generate LLCS for output + std::vector> cmdRanges; + auto registerCommandStream = _architecture->RegisterCommandStreamGenerator()->GenerateCommandStream( + highLevelCommandStream, &cmdRanges, _compilerOptions.verboseRegisterCommandStream); + + if ( registerCommandStream.empty() ) + { + SetLastError("Failed to generate command stream"); + return nullptr; + } + + if ( _optDb ) + { + int streamId = _optDb->AddStream(); + for ( auto const &cmd : cmdRanges ) + { + _optDb->AddCommand(std::get<0>(cmd), streamId, std::get<2>(cmd) - 1); + } + } + + customOperatorBuilder.Serialise(graphOp, npuOp, registerCommandStream); + } + + return newGraph; +} + +GraphApi::IGraphBuilder *Compiler::CreateGraph(const char *name) +{ + auto pos = std::find_if(_builders.begin(), _builders.end(), [&](auto &b) { return b.Name() == name; }); + if ( pos != _builders.end() ) + { + return &(*pos); + } + + _builders.emplace_back(name); + return &_builders.back(); +} + +Graph *Compiler::GetGraph(const char *name) +{ + auto pos = std::find_if(_graphs.begin(), _graphs.end(), [&](auto &b) { return b->Name() == name; }); + if ( pos != _graphs.end() ) + { + return pos->get(); + } + return nullptr; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/compiler.hpp b/ethosu/regor/compiler/compiler.hpp new file mode 100644 index 00000000..4ed81bb1 --- /dev/null +++ b/ethosu/regor/compiler/compiler.hpp @@ -0,0 +1,133 @@ +// +// SPDX-FileCopyrightText: 
Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "architecture/architecture.hpp" +#include "database.hpp" +#include "graph.hpp" +#include "graph_builder.hpp" +#include "graph_optimiser.hpp" +#include "include/regor_interface.hpp" +#include "network_performance.hpp" +#include "scheduler.hpp" +#include "tensor_allocator.hpp" + +#include +#include +#include + +#include "include/regor.h" + +namespace regor +{ + +enum class OutputFormat : uint16_t +{ + None, + TFLite, + Raw, +}; + +/// +/// Compilation options +/// +struct CompilerOptions +{ + bool verboseHighLevelCommandStream = false; + bool verboseRegisterCommandStream = false; + bool debugDatabase = false; + bool perfReport = true; + OutputFormat outputFormat = OutputFormat::TFLite; +}; + +/// +/// Regor top level compiler context (could just become Context) +/// +class Compiler : public IRegorReporting +{ +private: + SchedulerOptions _schedulerOptions; + CompilerOptions _compilerOptions; + GraphOptimiserOptions _graphOptimiserOptions; + std::unique_ptr _architecture; + std::string _lastError; + std::deque _output; + PerformanceResult _perfResult; + class Database _Db; + std::unique_ptr _optDb; + + Graph *_entryPoint = nullptr; + std::vector> _graphs; + std::list _builders; + +public: + void *userApiArg = nullptr; + +public: + Compiler() = delete; + 
Compiler(const Compiler &) = delete; + Compiler(std::unique_ptr &arch); + ~Compiler(); + +public: + bool ParseConfig(const char *text, size_t size); + bool ParseOptions(const char *text, size_t size); + + bool LoadTosa(const void *input, size_t size); + bool LoadTflite(const void *input, size_t size); + bool Store(const std::vector> &graphs, + const std::vector> &tensorAddressMaps); + + bool Compile(); + + [[nodiscard]] IRegorBlob *Output() + { + if ( _output.empty() ) + { + return nullptr; + } + + auto blob = _output.front(); + _output.pop_front(); + return blob; + } + + void SetLastError(const char *message) { _lastError = message; } + void SetLastError(const std::string &message) { _lastError = message; } + Architecture *Arch() { return _architecture.get(); } + const std::string &LastError() const { return _lastError; } + const PerformanceResult &LastPerfResult() const { return _perfResult; } + // From IRegorReporting + IDatabase *OptimiserDatabase() { return &_Db; } + + GraphApi::IGraphBuilder *CreateGraph(const char *name); + Graph *GetGraph(const char *name); + +private: + bool BuildNetwork(const char *entryGraph); + + std::unique_ptr CompileGraph(std::unique_ptr &graph, IncrementalLinearAllocator &readOnlyAllocator, + std::unordered_map &tensorAddressMap); + + Compiler &operator=(const Compiler &) = delete; +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/database.hpp b/ethosu/regor/compiler/database.hpp new file mode 100644 index 00000000..f9d5d74f --- /dev/null +++ b/ethosu/regor/compiler/database.hpp @@ -0,0 +1,208 @@ +// +// SPDX-FileCopyrightText: Copyright 2022-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "include/regor_database.hpp" + +#include +#include + +namespace regor +{ + + + +/// +/// Base database implementation +/// +class Database : public IDatabase +{ +protected: + struct DataRow + { + int uniqueId = 0; + std::unique_ptr fields; + + DataRow() = default; + + DataRow(DataRow &&other) noexcept : fields(std::move(other.fields)) { uniqueId = other.uniqueId; } + + DataRow &operator=(DataRow &&other) noexcept + { + this->uniqueId = other.uniqueId; + this->fields = std::move(other.fields); + return *this; + } + }; + + struct DataTable + { + std::string name; + std::vector rows; + std::vector columnNames; + int columns = 0; + bool isIndexed = true; + DataTable(const char *tableName) : name(tableName) {} + }; + + + struct RowIterator : public IRowIterator + { + private: + const std::string *_fields; + int _uniqueId = 0; + int _columns = 0; + int _index = -1; + + public: + RowIterator(int uniqueId, const std::string *fields, int columns) : + _fields(fields), _uniqueId(uniqueId), _columns(columns) + { + } + std::string Value() override { return _fields[_index]; } + int Id() override { return _uniqueId; } + int Column() override { return _index; } + bool Next() override + { + _index++; + return _index < _columns; + } + void Release() override { delete this; } + }; + + struct TableIterator : public ITableIterator + { + private: + std::vector &_tables; + int _index = -1; + + public: + TableIterator(std::vector &tables) : _tables(tables) {} + + public: + std::string Name() override + { + 
assert(_index >= 0); + return _tables[_index].name; + } + int Rows() override + { + assert(_index >= 0); + return int(_tables[_index].rows.size()); + } + int Columns() override + { + assert(_index >= 0); + return _tables[_index].columns; + } + IRowIterator *ColumnNames() override + { + return new RowIterator( + _tables[_index].isIndexed ? 1 : 0, _tables[_index].columnNames.data(), _tables[_index].columns); + } + IRowIterator *Row(int row) override + { + const auto &entry = _tables[_index].rows[row]; + return new RowIterator(entry.uniqueId, entry.fields.get(), _tables[_index].columns); + } + bool Next() override + { + _index++; + return _index < int(_tables.size()); + } + void Release() override { delete this; } + }; + + std::vector _tables; + +public: + int AddTable(const char *name, bool isIndexed = true) + { + _tables.emplace_back(name); + _tables.back().isIndexed = isIndexed; + return int(_tables.size()) - 1; + } + + void AddColumns(int tableId, std::initializer_list names) + { + assert(tableId >= 0 && tableId < int(_tables.size())); + DataTable *table = &_tables[tableId]; + table->columnNames.insert(table->columnNames.end(), names.begin(), names.end()); + table->columns = int(table->columnNames.size()); + } + + void AddColumns(int tableId, std::vector names) + { + assert(tableId >= 0 && tableId < int(_tables.size())); + DataTable *table = &_tables[tableId]; + table->columnNames.insert(table->columnNames.end(), names.begin(), names.end()); + table->columns = int(table->columnNames.size()); + } + + int AddRow(int tableId, int uniqueId, std::initializer_list values) + { + assert(tableId >= 0 && tableId < int(_tables.size())); + DataTable *table = &_tables[tableId]; + DataRow tmp; + tmp.uniqueId = uniqueId; + tmp.fields = std::unique_ptr(new std::string[table->columns]); + auto pos = values.begin(); + for ( int i = 0; i < table->columns && pos != values.end(); i++, pos++ ) + { + tmp.fields[i] = *pos; + } + + table->rows.push_back(std::move(tmp)); + return 
int(table->rows.size()) - 1; + } + + int AddRow(int tableId, int uniqueId, std::vector values) + { + assert(tableId >= 0 && tableId < int(_tables.size())); + DataTable *table = &_tables[tableId]; + DataRow tmp; + tmp.uniqueId = uniqueId; + tmp.fields = std::unique_ptr(new std::string[table->columns]); + auto pos = values.begin(); + for ( int i = 0; i < table->columns && pos != values.end(); i++, pos++ ) + { + tmp.fields[i] = *pos; + } + + table->rows.push_back(std::move(tmp)); + return int(table->rows.size()) - 1; + } + + void SetField(int tableId, int row, int column, const std::string &value) + { + assert(tableId >= 0 && tableId < int(_tables.size())); + DataTable *table = &_tables[tableId]; + assert(row < int(table->rows.size())); + assert(column < table->columns); + table->rows[row].fields[column] = value; + } + + // From IDatabase + ITableIterator *Tables() override { return new TableIterator(_tables); } +}; + + +} // namespace regor diff --git a/ethosu/regor/compiler/faststorage_allocator.cpp b/ethosu/regor/compiler/faststorage_allocator.cpp new file mode 100644 index 00000000..98afd7c9 --- /dev/null +++ b/ethosu/regor/compiler/faststorage_allocator.cpp @@ -0,0 +1,403 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "compiler/faststorage_allocator.hpp" + +#include "architecture/architecture.hpp" +#include "common/vector_span.hpp" +#include "live_range.hpp" +#include "scheduler.hpp" + +#include +#include +#include +#include +#include + +namespace regor +{ + +// FastStorageComponentAllocator +FastStorageComponentAllocator::FastStorageComponentAllocator(std::vector *baseMemUsage, + std::vector *maxMemUsage, int stagingLimit, std::unordered_map *elementAccessLrs) : + _baseMemUsage(baseMemUsage), + _maxMemUsage(maxMemUsage), _stagingLimit(stagingLimit), _elementAccessLrs(elementAccessLrs) +{ +} + +// Allocates live ranges. Outputs a vector that gives for each live range if it should be evicted or kept +void FastStorageComponentAllocator::Allocate(vector_span &lrs, std::vector &evicted) +{ + int sz = lrs.size(); + evicted.resize(sz); + _lrs = lrs; + _evicted = &evicted; + _currEvicted.resize(sz); + _bestScore = 0; + AllocateExhaustive(0, 0); + _evicted = nullptr; +} + +// Exhaustive, recursive search, starting at the given index +void FastStorageComponentAllocator::AllocateExhaustive(int ix, int score) +{ + if ( ix >= _lrs.size() ) + { + // Check if score is better (more access is better) + if ( score > _bestScore || _bestScore == 0 ) + { + // Best so far, remember this solution + _bestScore = score; + *_evicted = _currEvicted; + } + return; + } + + auto lr = _lrs[ix]; + for ( int t = lr->startTime; t <= lr->endTime; ++t ) + { + assert((*_baseMemUsage)[t] <= (*_maxMemUsage)[t]); + } + // Current peak usage during this live range + int baseUsage = *std::max_element(&(*_baseMemUsage)[lr->startTime], &(*_baseMemUsage)[lr->endTime + 1]); + bool canFit = baseUsage + lr->size <= _stagingLimit; + bool alwaysFits = canFit; + if ( canFit ) + { + // Keep current lr + int maxUsage = *std::max_element(&(*_maxMemUsage)[lr->startTime], &(*_maxMemUsage)[lr->endTime + 1]); + // If alwaysFits is true, lr can be kept regardless of the allocation of the other lrs + alwaysFits = 
maxUsage <= _stagingLimit; + _currEvicted[ix] = false; + int lrScore = 0; + auto entry = _elementAccessLrs->find(lr); + if ( entry != _elementAccessLrs->end() ) + { + lrScore = entry->second; + } + UpdateMemUsage(_baseMemUsage, lr, true); + + AllocateExhaustive(ix + 1, score + lrScore); + UpdateMemUsage(_baseMemUsage, lr, false); + } + if ( !alwaysFits ) + { + // Evict current lr + _currEvicted[ix] = true; + UpdateMemUsage(_maxMemUsage, lr, false); + AllocateExhaustive(ix + 1, score); + UpdateMemUsage(_maxMemUsage, lr, true); + } +} + +void FastStorageComponentAllocator::UpdateMemUsage(std::vector *memUsage, LiveRange *lr, bool increase) +{ + for ( int t = lr->startTime; t <= lr->endTime; ++t ) + { + (*memUsage)[t] += increase ? lr->size : -lr->size; + assert((*memUsage)[t] >= 0); + } +} + + +// FastStorageAllocator + +void FastStorageAllocator::AllocateFeatureMaps(const std::vector> &schedOps, + Schedule *schedule, const MemArea &fastStorage, Address stagingLimit) +{ + _stagingLimit = int(std::min(INT64_C(1) << 30, stagingLimit)); + // Force all OFMs to fast-storage (except final outputs) + // _scratchedFms contains the original tensor MemArea (not fast storage) that tensors will be evicted to + _scratchedFms.clear(); + for ( auto &schedOp : schedOps ) + { + if ( !schedOp->IsNpuOp() ) + { + continue; + } + + auto cost = schedule->Cost(schedOp.get()); + if ( cost->cascade == 0 ) + { + SchedulerConnection *ofm = schedOp->OFM(); + if ( !ofm->tensor->consumers.empty() && !ofm->tensor->hasCPUReaders && !ofm->tensor->isGraphOutput && + _scratchedFms.count(ofm->tensor.get()) == 0 ) + { + _scratchedFms[ofm->tensor.get()] = ofm->tensor->memArea; + ofm->tensor->memArea = fastStorage; + } + } + } + + auto lrGraph = LiveRangeGraph(); + lrGraph.ExtractLiveRangesFromCascades(schedOps, schedule, fastStorage, true); + // Populate time-array with memory used by live ranges + int maxUsage; + _maxMemUsage = lrGraph.GetTemporalMemoryUsage(maxUsage); + + if ( maxUsage <= _stagingLimit 
) + { + // All feature maps fit in fast storage + return; + } + // Not all feature maps fit in fast storage + _baseMemUsage = _maxMemUsage; + std::vector lrs; + for ( auto lr : lrGraph.LiveRanges() ) + { + for ( auto &tens : lr->tensors ) + { + if ( _scratchedFms.count(tens) ) + { + lrs.push_back(lr.get()); + for ( int t = lr->startTime; t <= lr->endTime; ++t ) + { + _baseMemUsage[t] -= lr->size; + } + break; + } + } + } + + // Perform a first sweep to keep/evict live ranges that are obviously too big + std::vector canFitLrs; + for ( auto lr : lrs ) + { + // Highest memory usage in this live range + int baseUsage = *std::max_element(&_baseMemUsage[lr->startTime], &_baseMemUsage[lr->endTime + 1]); + + if ( baseUsage + lr->size > _stagingLimit ) + { + // Cannot possibly fit + Evict(lr); + } + else + { + canFitLrs.push_back(lr); + } + } + std::vector competingLrs; + for ( auto lr : canFitLrs ) + { + maxUsage = *std::max_element(&_maxMemUsage[lr->startTime], &_maxMemUsage[lr->endTime + 1]); + if ( maxUsage <= _stagingLimit ) + { + // Definitively fits without impacting other feature maps + Keep(lr); + } + else + { + competingLrs.push_back(lr); + } + } + // For the remaining live ranges a choice must be made which to keep and which to evict. + // Divide the live ranges in connected components and do a search for each component + int sz = int(competingLrs.size()); + if ( sz == 0 ) + { + ElementwiseSanitizer(schedOps, schedule, fastStorage, lrGraph); + return; + } + + // For every competing live range accumulate the total element access for the ranges. + // Include all tensors access for a range - both read and write access. + // A live range that is used within a cascade is given the highest score possible. + // The reason is for cascaded elementwise operators where the other ifm (the cascade buffer) + // is already in fast storage. + // A live range with higher element access is considered more important to keep in + // in fast storage. 
+ std::unordered_map elementAccessLrs; + for ( auto lr : competingLrs ) + { + bool lrUsedWithinCascade = false; + int64_t access = 0; + for ( auto tens : lr->tensors ) + { + // Look at readers + for ( auto cons : tens->consumers ) + { + auto *ifm = cons->IFM(0); + auto *ifm2 = cons->TryIFM(1); + auto consCost = schedule->Cost(cons); + + CascadeInfo *cascadeInfo = + consCost == nullptr || consCost->cascade == 0 ? nullptr : &schedule->cascades[consCost->cascade]; + + if ( cascadeInfo && cons->Index() > cascadeInfo->start ) + { + lrUsedWithinCascade = true; + break; + } + + if ( ifm->tensor->srcTensor == tens->srcTensor && consCost ) + { + access += consCost->elementAccess.ifmRead[0]; + } + else if ( ifm2 && ifm2->tensor->srcTensor == tens->srcTensor && consCost ) + { + access += consCost->elementAccess.ifmRead[1]; + } + } + if ( !lrUsedWithinCascade ) + { + // Look at writers + for ( auto prod : tens->producers ) + { + auto cost = schedule->Cost(prod); + if ( cost == nullptr && prod->Parent() ) + { + // Most likely a fused LUT, use cost from primary op + cost = schedule->Cost(prod->Parent()); + } + if ( cost ) + { + access += cost->elementAccess.ofmWrite; + } + } + } + else + { + access = FastStorageComponentAllocator::MAX_ACCESS_SIZE; + } + } + elementAccessLrs[lr] = access; + } + + int start = 0; + int startTime = competingLrs[0]->startTime; + int endTime = competingLrs[0]->endTime; + FastStorageComponentAllocator componentAllocator(&_baseMemUsage, &_maxMemUsage, _stagingLimit, &elementAccessLrs); + + // Calculate and allocate connected components + for ( int i = 1; i < sz; ++i ) + { + auto lr = competingLrs[i]; + if ( lr->startTime <= endTime && i - start <= MAX_COMPONENT_SIZE ) + { + // Add to existing component + startTime = std::min(startTime, lr->startTime); + endTime = std::max(endTime, lr->endTime); + } + else + { + // lr is start of a new component; allocate the current component + vector_span span(competingLrs, start, i); + 
AllocateComponent(componentAllocator, span); + // Start a new component + start = i; + startTime = lr->startTime; + endTime = lr->endTime; + } + } + vector_span span(competingLrs, start, sz); + AllocateComponent(componentAllocator, span); + ElementwiseSanitizer(schedOps, schedule, fastStorage, lrGraph); +} + +// Allocates a connected range of live ranges +void FastStorageAllocator::AllocateComponent(FastStorageComponentAllocator &allocator, vector_span &lrs) +{ + std::vector evicted; + int sz = lrs.size(); + allocator.Allocate(lrs, evicted); + assert(sz == int(evicted.size())); + for ( int i = 0; i < sz; ++i ) + { + if ( evicted[i] ) + { + Evict(lrs[i]); + } + else + { + Keep(lrs[i]); + } + } +} + +void FastStorageAllocator::ElementwiseSanitizer(const std::vector> &schedOps, + Schedule *schedule, const MemArea &fastStorage, LiveRangeGraph &lrGraph) + +{ + // For now - enforce that both ifm's should be in the same memory for elementwise + for ( auto &schedOp : schedOps ) + { + if ( !schedOp->IsNpuOp() ) + { + continue; + } + + if ( IsBinaryElementwise(schedOp->_type) ) + { + auto *ifm = schedOp->IFM(0); + auto *ifm2 = schedOp->TryIFM(1); + auto consCost = schedule->Cost(schedOp.get()); + + CascadeInfo *cascadeInfo = + consCost == nullptr || consCost->cascade == 0 ? 
nullptr : &schedule->cascades[consCost->cascade]; + + if ( cascadeInfo && schedOp->Index() > cascadeInfo->start ) + // Within cascade there is nothing to do, since cascade buffer is in fast storage + continue; + + if ( ifm2 && ifm2->tensor->memArea != ifm->tensor->memArea ) + { + // One ifm not in fast storage + if ( ifm->tensor->memArea == fastStorage && !ifm2->tensor->IsConstant() ) + { + // Ifm in fast storage and ifm2 is not a constant + auto lr = lrGraph.GetOrCreateRange(ifm->tensor.get()); + Evict(lr); + } + + if ( ifm2->tensor->memArea == fastStorage && !ifm->tensor->IsConstant() ) + { + // Ifm2 in fast storage and ifm is not a constant + auto lr = lrGraph.GetOrCreateRange(ifm2->tensor.get()); + Evict(lr); + } + } + } + } +} + +void FastStorageAllocator::Evict(LiveRange *lr) +{ + for ( int t = lr->startTime; t <= lr->endTime; ++t ) + { + _maxMemUsage[t] -= lr->size; + } + for ( auto &tens : lr->tensors ) + { + auto entry = _scratchedFms.find(tens); + if ( entry != _scratchedFms.end() ) + { + tens->memArea = entry->second; + } + } +} + +void FastStorageAllocator::Keep(LiveRange *lr) +{ + for ( int t = lr->startTime; t <= lr->endTime; ++t ) + { + _baseMemUsage[t] += lr->size; + assert(_baseMemUsage[t] <= _stagingLimit); + } +} + +} // namespace regor diff --git a/ethosu/regor/compiler/faststorage_allocator.hpp b/ethosu/regor/compiler/faststorage_allocator.hpp new file mode 100644 index 00000000..19f512c6 --- /dev/null +++ b/ethosu/regor/compiler/faststorage_allocator.hpp @@ -0,0 +1,98 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "architecture/architecture.hpp" +#include "common/vector_span.hpp" +#include "live_range.hpp" +#include "scheduler.hpp" + +#include +#include +#include +#include +#include + +namespace regor +{ + +// Allocates a connected set of live ranges to fast storage. +// +// The allocator attempts to maximize the sum of the sizes of all feature maps +// that are placed in fast storage. +class FastStorageComponentAllocator +{ +private: + // Memory usage per timestamp when no lrs are kept + std::vector *_baseMemUsage = nullptr; + // Memory usage per timestamp when all lrs are kept + std::vector *_maxMemUsage = nullptr; + + Address _stagingLimit; + vector_span _lrs; + // Indices of evicted lrs in best solution + std::vector *_evicted = nullptr; + // Indices of evicted lrs in current solution + std::vector _currEvicted; + int _bestScore = 0; + std::unordered_map *_elementAccessLrs = nullptr; + // Use default seed (which is well-defined) to guarantee reproducible results + std::mt19937 _rng; + +public: + static constexpr int MAX_ACCESS_SIZE = std::numeric_limits::max(); + + FastStorageComponentAllocator(std::vector *baseMemUsage, std::vector *maxMemUsage, int stagingLimit, + std::unordered_map *elementAccessLrs); + // Allocates live ranges. 
Outputs a vector that gives for each live range if it should be evicted or kept + void Allocate(vector_span &lrs, std::vector &evicted); + +private: + // Exhaustive, recursive search, starting at the given index + void AllocateExhaustive(int ix, int score); + void UpdateMemUsage(std::vector *memUsage, LiveRange *lr, bool increase); +}; + +// Allocates feature maps to fast storage +class FastStorageAllocator +{ +private: + static constexpr int64_t MAX_COMPONENT_SIZE = 20; + // Remembers feature map's memory before it was allocated to fast storage + std::unordered_map _scratchedFms; + // Memory usage with all feature maps in fast storage + std::vector _maxMemUsage; + // Memory usage without feature maps that still need to be allocated + std::vector _baseMemUsage; + int _stagingLimit = 0; + +public: + void AllocateFeatureMaps(const std::vector> &schedOps, Schedule *schedule, + const MemArea &fastStorage, Address stagingLimit); + +private: + // Allocates a connected range of live ranges + void AllocateComponent(FastStorageComponentAllocator &allocator, vector_span &lrs); + void ElementwiseSanitizer(const std::vector> &schedOps, Schedule *schedule, + const MemArea &fastStorage, LiveRangeGraph &lrGraph); + void Evict(LiveRange *lr); + void Keep(LiveRange *lr); +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/graph.hpp b/ethosu/regor/compiler/graph.hpp new file mode 100644 index 00000000..05755cc8 --- /dev/null +++ b/ethosu/regor/compiler/graph.hpp @@ -0,0 +1,172 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "operation.hpp" +#include "tensor.hpp" + +#include +#include +#include +#include +#include + +namespace regor +{ + +enum class GraphNotation +{ + Invalid = 0, + GraphAPI = 1, + TFLite = 2, +}; + +/// +/// Top level Neural Network Graph (NNG) +/// +class Graph +{ +private: + std::string _name; + std::vector> _inputs; + std::vector> _outputs; + GraphNotation _notation = GraphNotation::Invalid; + uint32_t _syntaxVersion = 0; + const void *_passthrough = nullptr; // Original flatbuffer description of this model (if it was loaded from one) + std::vector _opsInScheduledOrder; + +public: + Graph() = delete; + + Graph(GraphNotation nt) : _notation(nt) {} + + Graph(const std::string &name, std::vector> inputs, std::vector> outputs, GraphNotation nt, uint32_t version) : + _name(name), _inputs(std::move(inputs)), _outputs(std::move(outputs)), _notation(nt), _syntaxVersion(version) + { + } + + ~Graph() + { + _notation = GraphNotation::Invalid; + std::vector operations; + GetAllOperations(operations); + for ( auto operation : operations ) + { + operation->Disconnect(); + } + } + +public: + const std::string &Name() const { return _name; } + uint32_t SyntaxVersion() const { return _syntaxVersion; } + + const std::vector> &Inputs() const { return _inputs; } + const std::vector> &Outputs() const { return _outputs; } + + void AddInput(const std::shared_ptr &input) { _inputs.push_back(input); } + void AddOutput(const std::shared_ptr &output) { _outputs.push_back(output); } + + bool IsInput(const Tensor *tensor) const + { + return 
std::find_if(_inputs.begin(), _inputs.end(), + [&](const std::shared_ptr &input) { return input.get() == tensor; }) != _inputs.end(); + } + bool IsOutput(const Tensor *tensor) const + { + return std::find_if(_outputs.begin(), _outputs.end(), + [&](const std::shared_ptr &output) { return output.get() == tensor; }) != _outputs.end(); + } + + GraphNotation Notation() const + { + assert(_notation != GraphNotation::Invalid); + return _notation; + } + + uint32_t Version() const { return _syntaxVersion; } + + const void *Passthrough() const { return _passthrough; } + void SetPassthrough(const void *passthrough) { _passthrough = passthrough; } + + // Finds all operations which precede a graph output and adds them to the vector in execution order + void GetAllOperations(std::vector &operations) const + { + TraverseGraphFromEnd(Outputs(), + [&](Operation *op) -> bool + { + operations.push_back(op); + return true; + }); + } + + // Get all operations in the graph, in scheduled order + const std::vector &ScheduledOrder() const { return _opsInScheduledOrder; }; + + void SetScheduledOrder(std::vector operations) { _opsInScheduledOrder = std::move(operations); } + + template + static void TraverseGraphFromEnd(const std::vector> &from, OPFUNC opFunc) + { + struct Entry + { + bool done; + Operation *op; + }; + std::unordered_set visited; + std::stack stack; + + for ( const auto &tensor : from ) + { + for ( const auto &op : tensor->Writers() ) + { + stack.push(Entry{false, op.get()}); + } + } + + while ( !stack.empty() ) + { + Entry entry = stack.top(); + stack.pop(); + if ( entry.done ) + { + if ( !opFunc(entry.op) ) + { + return; + } + } + else if ( visited.count(entry.op) == 0 ) + { + visited.insert(entry.op); + stack.push(Entry{true, entry.op}); + for ( const auto &pair : entry.op->Inputs().pairs() ) + { + for ( const auto &op : pair.second.tensor->Writers() ) + { + if ( visited.count(op.get()) == 0 ) + { + stack.push(Entry{false, op.get()}); + } + } + } + } + } + } +}; + +} // 
namespace regor diff --git a/ethosu/regor/compiler/graph_builder.cpp b/ethosu/regor/compiler/graph_builder.cpp new file mode 100644 index 00000000..ca9f2d02 --- /dev/null +++ b/ethosu/regor/compiler/graph_builder.cpp @@ -0,0 +1,530 @@ +// +// SPDX-FileCopyrightText: Copyright 2022-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "graph_builder.hpp" + +#include "attributes.hpp" +#include "common/numeric_util.hpp" +#include "graph.hpp" +#include "operation.hpp" + +#include +#include +#include + + +namespace regor +{ + +TensorUsage GraphAPIUsageToTensorUsage(GraphApi::GraphTensorUsage usage) +{ + return regor::TensorUsage(usage); // currently 1:1 mapping required +} + +namespace +{ + +// clang-format off +static constexpr std::pair s_aTosaMapping[] = { + {tosa::Op::ARGMAX, OpType::ArgMax}, + {tosa::Op::AVG_POOL2D, OpType::AvgPool}, + {tosa::Op::CONV2D, OpType::Conv2D}, + {tosa::Op::CONV3D, OpType::Conv3D}, + {tosa::Op::DEPTHWISE_CONV2D, OpType::DepthwiseConv2DBias}, + {tosa::Op::FULLY_CONNECTED, OpType::FullyConnected}, + {tosa::Op::MATMUL, OpType::MatMul}, + {tosa::Op::MAX_POOL2D, OpType::MaxPool}, + {tosa::Op::TRANSPOSE_CONV2D, OpType::TransposeConv2D}, + {tosa::Op::CLAMP, OpType::Clamp}, + {tosa::Op::SIGMOID, OpType::Sigmoid}, + {tosa::Op::TANH, OpType::Tanh}, + {tosa::Op::ADD, OpType::Add}, + {tosa::Op::ARITHMETIC_RIGHT_SHIFT, OpType::Asr}, + {tosa::Op::BITWISE_AND, 
OpType::And}, + {tosa::Op::BITWISE_OR, OpType::Or}, + {tosa::Op::BITWISE_XOR, OpType::Xor}, + {tosa::Op::INTDIV, OpType::Div}, + {tosa::Op::LOGICAL_AND, OpType::LogicalAnd}, + {tosa::Op::LOGICAL_LEFT_SHIFT, OpType::SHL}, + {tosa::Op::LOGICAL_RIGHT_SHIFT, OpType::SHR}, + {tosa::Op::LOGICAL_OR, OpType::LogicalOr}, + {tosa::Op::LOGICAL_XOR, OpType::LogicalXor}, + {tosa::Op::MAXIMUM, OpType::Maximum}, + {tosa::Op::MINIMUM, OpType::Minimum}, + {tosa::Op::MUL, OpType::Mul}, + {tosa::Op::POW, OpType::Pow}, + {tosa::Op::SUB, OpType::Sub}, + {tosa::Op::TABLE, OpType::LUT}, + {tosa::Op::ABS, OpType::Abs}, + {tosa::Op::BITWISE_NOT, OpType::Not}, + {tosa::Op::CEIL, OpType::Ceil}, + {tosa::Op::CLZ, OpType::CLZ}, + {tosa::Op::EXP, OpType::Exp}, + {tosa::Op::FLOOR, OpType::Floor}, + {tosa::Op::LOG, OpType::Log}, + {tosa::Op::LOGICAL_NOT, OpType::LogicalNot}, + {tosa::Op::NEGATE, OpType::Neg}, + {tosa::Op::RECIPROCAL, OpType::Reciprocal}, + {tosa::Op::RSQRT, OpType::Rsqrt}, + {tosa::Op::SELECT, OpType::Select}, + {tosa::Op::EQUAL, OpType::Equal}, + {tosa::Op::GREATER, OpType::Greater}, + {tosa::Op::GREATER_EQUAL, OpType::GreaterEqual}, + {tosa::Op::REDUCE_ANY, OpType::ReduceAny}, + {tosa::Op::REDUCE_ALL, OpType::ReduceAll}, + {tosa::Op::REDUCE_MAX, OpType::ReduceMax}, + {tosa::Op::REDUCE_MIN, OpType::ReduceMin}, + {tosa::Op::REDUCE_PRODUCT, OpType::ReduceProduct}, + {tosa::Op::REDUCE_SUM, OpType::ReduceSum}, + {tosa::Op::CONCAT, OpType::Concat}, + {tosa::Op::PAD, OpType::Pad}, + {tosa::Op::RESHAPE, OpType::Reshape}, + {tosa::Op::REVERSE, OpType::Reverse}, + {tosa::Op::SLICE, OpType::Slice}, + {tosa::Op::TILE, OpType::Tile}, + {tosa::Op::TRANSPOSE, OpType::Transpose}, + {tosa::Op::GATHER, OpType::Gather}, + {tosa::Op::SCATTER, OpType::Scatter}, + {tosa::Op::RESIZE, OpType::Resize}, + {tosa::Op::CAST, OpType::Cast}, + {tosa::Op::RESCALE, OpType::Rescale}, + {tosa::Op::IDENTITY, OpType::Identity}, + {tosa::Op::CUSTOM, OpType::Custom}, + {tosa::Op::COND_IF, OpType::If}, + 
{tosa::Op::WHILE_LOOP, OpType::While}, + //{tosa::Op::FFT2D, OpType::CurrentlyUnsupported}, + //{tosa::Op::RFFT2D, OpType::CurrentlyUnsupported}, + //{tosa::Op::ERF, OpType::CurrentlyUnsupported}, + //{tosa::Op::DIM, OpType::CurrentlyUnsupported}, +}; + +static constexpr std::pair s_aTypeMapping[] = { + {GraphApi::GraphDataType::Bool8, DataType::Bool8}, + {GraphApi::GraphDataType::Int4Packed8, DataType::Int4Packed8}, + {GraphApi::GraphDataType::Int8, DataType::Int8}, + {GraphApi::GraphDataType::Int16, DataType::Int16}, + {GraphApi::GraphDataType::Int32, DataType::Int32}, + {GraphApi::GraphDataType::Int48, DataType::Int48}, + {GraphApi::GraphDataType::Int64, DataType::Int64}, + {GraphApi::GraphDataType::UInt8, DataType::UInt8}, + {GraphApi::GraphDataType::UInt16, DataType::UInt16}, + {GraphApi::GraphDataType::UInt32, DataType::UInt32}, + {GraphApi::GraphDataType::UInt48, DataType::UInt48}, + {GraphApi::GraphDataType::UInt64, DataType::UInt64}, + {GraphApi::GraphDataType::BFloat16, DataType::BFloat16}, + {GraphApi::GraphDataType::Float16, DataType::Float16}, + {GraphApi::GraphDataType::Float32, DataType::Float32}, +}; +// clang-format on + +template +constexpr bool is_sorted(const std::pair (&list)[SIZE]) +{ + A v = list[0].first; + for ( size_t i = 1; i < SIZE; i++ ) + { + if ( list[i].first < v ) return false; + v = list[i].first; + } + return true; +} + +static_assert(is_sorted(s_aTosaMapping), "TOSA mapping must be sorted"); + +bool map_tosa_op(tosa::Op op, regor::OpType &tosaOp) +{ + auto pos = std::equal_range(std::begin(s_aTosaMapping), std::end(s_aTosaMapping), + std::pair(op, {}), [](const auto &a, const auto &b) { return a.first < b.first; }); + if ( pos.first == std::end(s_aTosaMapping) ) + { + return false; + } + + tosaOp = pos.first->second; + return true; +} + +static_assert(is_sorted(s_aTypeMapping), "Type mapping must be sorted"); + +bool map_data_type(GraphApi::GraphDataType type, regor::DataType &out) +{ + auto pos = 
std::equal_range(std::begin(s_aTypeMapping), std::end(s_aTypeMapping), + std::pair(type, {}), + [](const auto &a, const auto &b) { return a.first < b.first; }); + if ( pos.first == std::end(s_aTypeMapping) ) + { + return false; + } + + out = pos.first->second; + return true; +} + +} // namespace + + +GraphBuilder::GraphBuilder(const std::string &name) : _graphName(name) +{ +} + +GraphBuilder::~GraphBuilder() +{ + FreeUnconnected(); +} + +bool GraphBuilder::RequireSyntaxVersion(uint32_t version, int32_t level) +{ + _syntaxVersion = version | uint32_t(level); + + if ( _syntaxVersion > (GraphApi::VERSION_TOSA_0_60 | GraphApi::PROFILE_BASELINE) ) // 0.60.Baseline + { + return false; + } + + return true; +} + +GraphApi::GraphOperation *GraphBuilder::CreateOp(tosa::Op tosaType, const GraphKernel *kernel) +{ + OpType type = OpType::None; + if ( !map_tosa_op(tosaType, type) ) + { + return nullptr; + } + + auto op = std::make_shared(type); + if ( kernel ) + { + op->SetKernel(std::make_unique(kernel)); + } + else + { + op->SetKernel(std::make_unique(Point2i(1, 1), Point2i(1, 1), Point2i(1, 1))); + } + _operations.push_back(op); + + return op.get(); +} + + +struct GraphBuilderBuffer : public Buffer, public GraphApi::GraphBuffer +{ + template + GraphBuilderBuffer(int sizeBytes, const TYPE *p, bool alias) : Buffer(sizeBytes, p, alias) + { + } +}; + + +GraphApi::GraphBuffer *GraphBuilder::CreateBuffer(size_t sizeBytes, GraphApi::BufferMapping mapping, const void *initialData) +{ + auto buffer = std::make_shared( + int(std::clamp(sizeBytes, 0, unsigned(std::numeric_limits::max()))), + reinterpret_cast(initialData), (mapping == BufferMapping::Alias)); + _buffers.push_back(buffer); + return buffer.get(); +} + +GraphApi::GraphTensor *GraphBuilder::CreateTensor( + const char *name, const GraphShape &shape, GraphTensorLayout layout, GraphDataType dataType, GraphBuffer *buffer) +{ + DataType type; + if ( !map_data_type(dataType, type) ) + { + return nullptr; + } + + auto tensor = 
std::make_shared(name, type); + tensor->SetStorageShape(Shape(shape.axisNHWC, int(shape.count))); + // TODO: Handle external tensor format specification - tensor->SetStorageLayout(layout); + if ( buffer ) + { + assert(uintptr_t(buffer) % alignof(GraphBuilderBuffer) == 0); + auto graphBuffer = static_cast(buffer)->shared_from_this(); + int reqBytes = DataTypeStorageSizeBytes(type, tensor->StorageShape().Elements()); + assert(layout == GraphTensorLayout::Linear); + UNUSED(layout); + assert(reqBytes <= graphBuffer->Size()); + if ( reqBytes > graphBuffer->Size() ) + { + return nullptr; + } + tensor->SetBuffer(graphBuffer); + } + _tensors.push_back(tensor); + return tensor.get(); +} + +void GraphBuilder::AddInput(GraphTensor *graphTensor) +{ + auto tensor = static_cast(graphTensor); + _inputs.push_back(tensor->shared_from_this()); +} + +void GraphBuilder::AddOutput(GraphTensor *graphTensor) +{ + auto tensor = static_cast(graphTensor); + _outputs.push_back(tensor->shared_from_this()); +} + +void GraphBuilder::AddInput(GraphOperation *graphOp, GraphTensorUsage usage, GraphTensor *graphTensor) +{ + auto op = static_cast(graphOp); + // TODO check cross graph contamination - assert( std::find(_operations.begin(), _operations.end(), op) != + // _operations.end() ); + auto tensor = static_cast(graphTensor); + auto tmp = GraphAPIUsageToTensorUsage(usage); + int count = op->CountInputs(tmp); + op->ConnectInput(MakeTensorUsage(tmp, count), tensor->shared_from_this()).Set(Quantization::Unit()); +} + +void GraphBuilder::AddOutput(GraphOperation *graphOp, GraphTensorUsage usage, GraphTensor *graphTensor) +{ + auto op = static_cast(graphOp); + // TODO check cross graph contamination - assert( std::find(_operations.begin(), _operations.end(), op) != + // _operations.end() ); + auto tensor = static_cast(graphTensor); + auto tmp = GraphAPIUsageToTensorUsage(usage); + int count = op->CountOutputs(tmp); + op->ConnectOutput(MakeTensorUsage(tmp, count), 
tensor->shared_from_this()).Set(Quantization::Unit()); +} + +namespace +{ + +const FieldInfo *FindField(const TypeInfo *info, uint32_t id) +{ + assert(info); + size_t length; + const FieldInfo *table = info->Fields(length); + for ( size_t i = 0; i < length; i++ ) + { + if ( table[i].id == id ) return &table[i]; + } + return nullptr; +} + +template +void WriteField(void *p, const TYPE &value) +{ + assert(p); + *reinterpret_cast(p) = value; +} + +template +bool ConvertToType(void *p, [[maybe_unused]] uint8_t destType, const TYPE &value) +{ + if ( destType == FieldTypeId::TYPEID ) + { + WriteField(p, value); + return true; + } + return false; +} + +template<> +bool ConvertToType(void *p, uint8_t destType, const int32_t &value) +{ + assert(p); + switch ( destType ) + { + case FieldTypeId::TYPEID: + [[fallthrough]]; + case FieldTypeId::TYPEID: + WriteField(p, uint8_t(value)); + break; + case FieldTypeId::TYPEID: + [[fallthrough]]; + case FieldTypeId::TYPEID: + WriteField(p, uint16_t(value)); + break; + case FieldTypeId::TYPEID: + [[fallthrough]]; + case FieldTypeId::TYPEID: + WriteField(p, uint32_t(value)); + break; + default: + assert(false); + return false; + break; + } + return true; +} + +template<> +bool ConvertToType(void *p, uint8_t destType, const double &value) +{ + assert(p); + switch ( destType ) + { + case FieldTypeId::TYPEID: + WriteField(p, float(value)); + break; + case FieldTypeId::TYPEID: + WriteField(p, value); + break; + default: + assert(false); + return false; + break; + } + return true; +} + +template +bool WriteAttributeValue(Operation *op, GraphApi::OpAttr attrId, const TYPE &value) +{ + DynamicRef *attr = op->AttributeByKey(uint32_t(attrId) >> 12); + if ( attr ) + { + const auto *field = FindField(attr->Info(), (uint32_t(attrId) >> 4) & 0x0FFFFFFF); + if ( field ) + { + void *to = reinterpret_cast(attr->Instance()) + field->offset; + return ConvertToType(to, field->typeId, value); + } + } + return false; +} + +} // namespace + +bool 
GraphBuilder::Set(GraphOperation *graphOp, GraphApi::OpAttr attr, bool value) +{ + auto op = static_cast(graphOp); + if ( (unsigned(attr) & 0xF) != GraphApi::GRAPHAPI_TYPECODE_bool ) + { + assert(false && "Attribute type not bool"); + return false; + } + + return WriteAttributeValue(op, attr, value); +} + +bool GraphBuilder::Set(GraphOperation *graphOp, GraphApi::OpAttr attr, int32_t value) +{ + auto op = static_cast(graphOp); + if ( (unsigned(attr) & 0xF) != GraphApi::GRAPHAPI_TYPECODE_int32 ) + { + assert(false && "Attribute type not int32"); + return false; + } + return WriteAttributeValue(op, attr, value); +} + +bool GraphBuilder::Set(GraphOperation *graphOp, GraphApi::OpAttr attr, double value) +{ + auto op = static_cast(graphOp); + if ( (unsigned(attr) & 0xF) != GraphApi::GRAPHAPI_TYPECODE_double ) + { + assert(false && "Attribute type not double"); + return false; + } + return WriteAttributeValue(op, attr, value); +} + +bool GraphBuilder::Set(GraphOperation *graphOp, GraphApi::OpAttr attr, const GraphApi::GraphShape &value) +{ + auto op = static_cast(graphOp); + if ( (unsigned(attr) & 0xF) != GraphApi::GRAPHAPI_TYPECODE_GraphShape ) + { + assert(false && "Attribute type not Shape"); + return false; + } + + Shape shape(value.axisNHWC, size_t(value.count)); + return WriteAttributeValue(op, attr, shape); +} + +bool GraphBuilder::Set(GraphOperation *graphOp, GraphApi::OpAttr attr, const GraphApi::Point2 &value) +{ + auto op = static_cast(graphOp); + if ( (unsigned(attr) & 0xF) != GraphApi::GRAPHAPI_TYPECODE_Point2 ) + { + assert(false && "Attribute type not Point2"); + return false; + } + + Point2i xy(value.x, value.y); + return WriteAttributeValue(op, attr, xy); +} + +bool GraphBuilder::Set(GraphOperation *graphOp, GraphApi::OpAttr attr, const char *value) +{ + auto op = static_cast(graphOp); + if ( (unsigned(attr) & 0xF) != GraphApi::GRAPHAPI_TYPECODE_string ) + { + assert(false && "Attribute type not string"); + return false; + } + + std::string str(value); + 
return WriteAttributeValue(op, attr, str); +} + +void GraphBuilder::SetZeroPoint(GraphOperation *graphOp, GraphTensorUsage tensorUsage, double zeroPoint) +{ + auto op = static_cast(graphOp); + auto usage = GraphAPIUsageToTensorUsage(tensorUsage); + auto *conn = (usage & regor::TensorUsage::TypeMask) == regor::TensorUsage::OFM ? op->Output(usage) : op->Input(usage); + if ( conn ) + { + conn->quantization.zeroPoints = {int64_t(zeroPoint)}; + } +} + +void GraphBuilder::SetAxisOrder(GraphTensor *graphTensor, GraphApi::AxisOrder order) +{ + auto tensor = static_cast(graphTensor); + tensor->SetAxisOrder(regor::AxisOrder(order)); +} + +void GraphBuilder::SetAxisStrides([[maybe_unused]] GraphTensor *graphTensor, [[maybe_unused]] const GraphApi::GraphShape *axisStrides) +{ + assert(axisStrides == nullptr && "Not currently implemented"); +} + + +void GraphBuilder::FreeUnconnected() +{ + try + { + // In case somebody added self-supporting graph fragments + std::unordered_set connected; + Graph::TraverseGraphFromEnd(_outputs, + [&](Operation *op) -> bool + { + connected.insert(op); + return true; + }); + for ( auto &op : _operations ) + { + if ( !connected.count(op.get()) ) + { + op->Disconnect(); + } + } + } + catch ( std::bad_weak_ptr & ) + { + // ignored + } +} + + +} // namespace regor diff --git a/ethosu/regor/compiler/graph_builder.hpp b/ethosu/regor/compiler/graph_builder.hpp new file mode 100644 index 00000000..ec67499e --- /dev/null +++ b/ethosu/regor/compiler/graph_builder.hpp @@ -0,0 +1,99 @@ +// +// SPDX-FileCopyrightText: Copyright 2022-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/buffer_view.hpp" +#include "include/graphapi.hpp" +#include "include/graphapi_tosa_types.hpp" +#include "tensor_properties.hpp" + +#include +#include +#include + +namespace regor +{ + +class Compiler; +class Operation; +class Tensor; + +/// +/// Graph Builder implementation +/// +class GraphBuilder : public GraphApi::IGraphBuilder +{ + friend class Compiler; + using GraphTensor = GraphApi::GraphTensor; + using GraphShape = GraphApi::GraphShape; + using GraphKernel = GraphApi::GraphKernel; + using GraphOperation = GraphApi::GraphOperation; + using GraphTensorUsage = GraphApi::GraphTensorUsage; + using GraphDataType = GraphApi::GraphDataType; + using GraphBuffer = GraphApi::GraphBuffer; + using BufferMapping = GraphApi::BufferMapping; + using GraphTensorLayout = GraphApi::GraphTensorLayout; + +protected: + std::string _graphName; + uint32_t _syntaxVersion = 0; + std::vector> _operations; + std::vector> _tensors; + std::vector> _inputs; + std::vector> _outputs; + std::vector> _buffers; + +public: + GraphBuilder(const std::string &name); + ~GraphBuilder(); + +public: + // Inherited via IGraphBuilder + bool RequireSyntaxVersion(uint32_t version, int32_t level) override; + GraphOperation *CreateOp(tosa::Op opType, const GraphKernel *kernel) override; + GraphBuffer *CreateBuffer(size_t sizeBytes, BufferMapping mapping, const void *initialData) override; + GraphTensor *CreateTensor(const char *name, const GraphShape &shape, GraphTensorLayout layout, + GraphDataType dataType, GraphBuffer *buffer) override; + // Set graph 
inputs/outputs + void AddInput(GraphTensor *graphTensor) override; + void AddOutput(GraphTensor *graphTensor) override; + // Connect operator inputs/outputs + void AddInput(GraphOperation *graphOp, GraphTensorUsage usage, GraphTensor *graphTensor) override; + void AddOutput(GraphOperation *graphOp, GraphTensorUsage usage, GraphTensor *graphTensor) override; + // Object attribute and properties + bool Set(GraphOperation *graphOp, GraphApi::OpAttr attr, bool value) override; + bool Set(GraphOperation *graphOp, GraphApi::OpAttr attr, int32_t value) override; + bool Set(GraphOperation *graphOp, GraphApi::OpAttr attr, double value) override; + bool Set(GraphOperation *graphOp, GraphApi::OpAttr attr, const GraphApi::GraphShape &value) override; + bool Set(GraphOperation *graphOp, GraphApi::OpAttr attr, const GraphApi::Point2 &value) override; + bool Set(GraphOperation *graphOp, GraphApi::OpAttr attr, const char *value) override; + void SetZeroPoint(GraphOperation *op, GraphTensorUsage usage, double zeroPoint) override; + void SetAxisOrder(GraphTensor *graphTensor, GraphApi::AxisOrder order) override; + void SetAxisStrides(GraphTensor *graphTensor, const GraphApi::GraphShape *axisStrides) override; + // Utility + const std::string &Name() const { return _graphName; } + uint32_t SyntaxVersion() const { return _syntaxVersion; } + +private: + void FreeUnconnected(); +}; + +TensorUsage GraphAPIUsageToTensorUsage(GraphApi::GraphTensorUsage usage); + +} // namespace regor diff --git a/ethosu/regor/compiler/graph_optimiser.cpp b/ethosu/regor/compiler/graph_optimiser.cpp new file mode 100644 index 00000000..c3407b8f --- /dev/null +++ b/ethosu/regor/compiler/graph_optimiser.cpp @@ -0,0 +1,341 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "graph_optimiser.hpp" + +#include "common/logging.hpp" + +#include "architecture/architecture.hpp" +#include "graph.hpp" +#include "graphir_optimiser.hpp" +#include "op_type.hpp" +#include "operation.hpp" +#include "tensor.hpp" +#include "tflite_graph_optimiser.hpp" + +#include +#include +#include +#include +#include +#include +#include + +#include "include/regor.h" + +namespace regor +{ + +std::unique_ptr GraphOptimiser::MakeGraphOptimiser( + GraphNotation notation, Architecture *arch, const GraphOptimiserOptions &options, OptimiserDatabase *db) +{ + switch ( notation ) + { + case GraphNotation::TFLite: + return std::unique_ptr(std::make_unique(arch, options, db)); + + case GraphNotation::GraphAPI: + return std::unique_ptr(std::make_unique(arch, options, db)); + + default: + LOG_ERROR("Invalid graph notation"); + assert(false); + } + + return {}; +} + +// Some debug functions +#if LOG_TRACE1_ON +Operation *GraphOptimiser::VisitOperatorLog(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + if ( GraphOptimiser::Options().verboseGraph ) + { + LOG_TRACE1("Rewrite operator visits: {0} (@{1})", OpTypeToString(operation->Type()), static_cast(operation)); + auto *ifmConn = operation->Input(TensorUsage::IFM0); + LOG_TRACE1(" -- IFM shape: [{0}] read shape: [{2}] offset [{1}]", + (ifmConn == nullptr ? "" : ifmConn->shape.ToString()), (ifmConn == nullptr ? "" : ifmConn->slice.offset.ToString()), + (ifmConn == nullptr ? 
"" : ifmConn->slice.shape.ToString())); + + auto idx = 1; + auto usage = MakeTensorUsage(TensorUsage::IFM, 1); + ifmConn = operation->Input(usage); + while ( ifmConn != nullptr ) + { + LOG_TRACE1(", [{0}] read shape: [{2}] offset [{1}]", ifmConn->shape.ToString(), + ifmConn->slice.offset.ToString(), ifmConn->slice.shape.ToString()); + usage = MakeTensorUsage(TensorUsage::IFM, ++idx); + ifmConn = operation->Input(usage); + } + auto *ofmConn = operation->Output(TensorUsage::OFM); + LOG_TRACE1(" - OFM shape: [{0}] write shape: [{2}] offset [{1}]\n", + (ofmConn == nullptr ? "" : ofmConn->shape.ToString()), (ofmConn == nullptr ? "" : ofmConn->slice.offset.ToString()), + (ofmConn == nullptr ? "" : ofmConn->slice.shape.ToString())); + } + return operation; +} + +Tensor *GraphOptimiser::VisitTensorLog(Graph *const graph, Tensor *const tensor) +{ + UNUSED(graph); + if ( GraphOptimiser::Options().verboseGraph ) + { + LOG_TRACE1("Rewrite tensor visits: {0} (@{1}) -- Tensor shape: [{2}]\n", tensor->Name(), + static_cast(tensor), tensor->StorageShape().ToString()); + } + return tensor; +} +#endif + +Operation *GraphOptimiser::RecordOperation(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + if ( _db ) + { + // TODO: implement ext key tracking for TOSA Networks. 
+ _db->SourceOp(operation); + } + return operation; +} + +Operation *GraphOptimiser::RecordOptimisation(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + // Remaining ops probably reference themselves + if ( _db ) + { + _db->AddOptimised(operation, operation); + } + return operation; +} + +void GraphOptimiser::RecordOptimisation(const Operation *operation, const Operation *op) +{ + if ( _db ) + { + _db->AddOptimised(operation, op); + } +} + +void GraphOptimiser::PrintGraph(const Graph *graph, const std::string &label) const +{ + if ( graph != nullptr ) + { + if ( !label.empty() ) + { + LOG_PRINT("\n[ {0} ]\n", label); + } + std::vector ops; + graph->GetAllOperations(ops); + auto idx = 0; + for ( const auto &op : ops ) + { + OpType type = op->Type(); + // TODO: This uses the OFM tensor name to identify the operator + std::string name = op->OFM() ? op->OFM()->Name() : ""; + LOG_PRINT("{0:<5} {1:<20} {2:<30}\n", idx, OpTypeToString(type), name); + ++idx; + } + LOG_PRINT("\n"); + } +} + +void GraphOptimiser::PrintQuantization(const Graph *graph, const std::string &label) const +{ + if ( graph != nullptr ) + { + if ( !label.empty() ) + { + LOG_PRINT("\n[ {0} ]\n", label); + } + std::vector ops; + graph->GetAllOperations(ops); + auto op_idx = 0; + for ( const auto &op : ops ) + { + OpType type = op->Type(); + std::string name = op->OFM() ? 
op->OFM()->Name() : ""; + LOG_PRINT("{0} {1} {2}\n", op_idx, OpTypeToString(type), name); + + const ordered_map &inputs = op->Inputs(); + auto input_idx = 0; + for ( const auto &v : inputs ) + { + const auto &tens = v.tensor; + std::string quantization_string = v.quantization.ToString(); + LOG_PRINT(" {0} {1:02} {2} {3} {4}\n", "Input", input_idx, DataTypeToString(tens->Type()), + quantization_string, tens->Name()); + input_idx++; + } + ++op_idx; + } + LOG_PRINT("\n"); + } +} + +void GraphOptimiser::Process(Graph *graph) +{ + if ( _options.verboseGraph ) + { + PrintGraph(graph, "Before Graph Optimisation"); + } + OptimiseGraph(graph); + if ( _options.verboseGraph ) + { + PrintGraph(graph, "After Graph Optimization"); + } + if ( _options.verboseQuantization ) + { + PrintQuantization(graph, "Graph With Tensor Quantization"); + } +} + +void GraphOptimiser::ParseGraphOptimiserOptions(GraphOptimiserOptions &opt, IniReader &reader) +{ + // Parse debug settings + std::string key; + while ( reader.Begin(key) ) + { + if ( key == "verbose" ) + { + opt.verboseGraph = reader.Get(); + } + if ( key == "verbose_quantization" ) + { + opt.verboseQuantization = reader.Get(); + } + + reader.End(); + } +} + +OptimiserDatabase::OptimiserDatabase(Database *db) : _db(db) +{ + _sourceTable = _db->AddTable("source"); + _optTable = _db->AddTable("optimised"); + _cmdTable = _db->AddTable("queue", false); + _streamTable = _db->AddTable("cmdstream"); + _db->AddColumns(_sourceTable, {"operator", "kernel_w", "kernel_h", "ofm_w", "ofm_h", "ofm_d", "ext_key"}); + _db->AddColumns(_optTable, {"source_id", "operator", "kernel_w", "kernel_h", "ofm_w", "ofm_h", "ofm_d"}); + _db->AddColumns(_cmdTable, {"offset", "cmdstream_id", "optimised_id"}); +} + +Database *OptimiserDatabase::Get() +{ + return _db; +} + +int OptimiserDatabase::SourceId(const void *op) +{ + // lookup op in optimised + auto pos = _optimised.find(op); + if ( pos != std::end(_optimised) ) + { + return std::get<0>(pos->second); + } + 
else if ( auto ptr = _source.find(op); ptr != std::end(_source) ) + { + // op is original-op + return ptr->second; + } + return 0; +} + +int OptimiserDatabase::OptimisedId(const void *op) +{ + // lookup op in optimised + auto pos = _optimised.find(op); + if ( pos != std::end(_optimised) ) + { + return std::get<1>(pos->second); + } + return 0; +} + +int OptimiserDatabase::SourceOp(const Operation *op, int ext_key) +{ + auto pos = _source.find(op); + if ( pos != _source.end() ) + { + return pos->second; + } + _sourceId++; + _source.emplace(op, _sourceId); + + auto k = op->Kernel()->Size(); + auto o = Shape::PadAxes(op->OFM()->View().ViewShape(), 3, 1); + _db->AddRow(_sourceTable, _sourceId, + {OpTypeToString(op->Type()), std::to_string(k.x), std::to_string(k.y), std::to_string(o.Width()), + std::to_string(o.Height()), std::to_string(o.Depth()), std::to_string(ext_key)}); + return _sourceId; +} + +void OptimiserDatabase::AddOptimised(const void *from, const Operation *to) +{ + assert(to); + + // Locate the source operation Id (if any) + int sourceId = 0; + if ( from != nullptr ) + { + // Look for source op in optimised list first and use that op's parent + // (source replacement doesn't matter) + auto pos = _optimised.find(from); + if ( pos != _optimised.end() ) + { + sourceId = std::get<0>(pos->second); + } + else + { + auto srcPos = _source.find(from); + if ( srcPos != _source.end() ) + { + sourceId = srcPos->second; + } + } + } + + _optId++; + _optimised[to] = std::tuple(sourceId, _optId); + + auto k = to->Kernel()->Size(); + auto o = Shape::PadAxes(to->OFM()->View().ViewShape(), 3, 1); + _db->AddRow(_optTable, _optId, + {std::to_string(sourceId), OpTypeToString(to->Type()), std::to_string(k.x), std::to_string(k.y), + std::to_string(o.Width()), std::to_string(o.Height()), std::to_string(o.Depth())}); +} + +void OptimiserDatabase::AddCommand(void *key, int stream, int cmdIndex) +{ + auto pos = _optimised.find(key); + if ( pos != _optimised.end() ) + { + int optId = 
std::get<1>(pos->second); + _db->AddRow(_cmdTable, 0, {std::to_string(4 * cmdIndex), std::to_string(stream), std::to_string(optId)}); + } +} + +int OptimiserDatabase::AddStream() +{ + _streamId++; + _db->AddRow(_streamTable, _streamId, {}); + return _streamId; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/graph_optimiser.hpp b/ethosu/regor/compiler/graph_optimiser.hpp new file mode 100644 index 00000000..4df471c1 --- /dev/null +++ b/ethosu/regor/compiler/graph_optimiser.hpp @@ -0,0 +1,212 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/logging.hpp" + +#include "architecture/architecture.hpp" +#include "common/ini_reader.hpp" +#include "graph.hpp" +#include "graph_optimiser_db.hpp" +#include "operation.hpp" +#include "tensor.hpp" + +#include +#include +#include + +#include "include/regor.h" + +namespace regor +{ + +/// +/// Tensor and Operation rewrite functions for the graph optimisation. 
+/// +template +struct RewriteFunctions +{ + const std::vector tensorFunction; + const std::vector opFunction; +}; + +/// +/// Graph optimiser options +/// +struct GraphOptimiserOptions +{ + bool verboseGraph = false; + bool verboseQuantization = false; +}; + +/// +/// Graph optimiser +/// +class GraphOptimiser +{ +protected: + Architecture *_arch = nullptr; + const GraphOptimiserOptions _options; + OptimiserDatabase *_db; + +public: + GraphOptimiser(Architecture *arch, const GraphOptimiserOptions &options, OptimiserDatabase *db) : + _arch(arch), _options(options), _db(db) + { + assert(_arch != nullptr); + } + const GraphOptimiserOptions &Options() const { return _options; } + + + + static std::unique_ptr MakeGraphOptimiser( + GraphNotation notation, Architecture *arch, const GraphOptimiserOptions &options, OptimiserDatabase *db); + static void ParseGraphOptimiserOptions(GraphOptimiserOptions &opt, IniReader &reader); + + void Process(Graph *graph); + virtual void OptimiseGraph(Graph *graph) = 0; + + // Note no check for if NPU operator, or "rewrite_unsupported". + // Such checks are delegated to each specific rewrite function. + template + void RewriteGraph(Graph *const graph, const RewriteFunctions &rewriteFuncs) + { + using OpFunction = Operation *(T::*)(Graph *, Operation *); + using TensFunction = Tensor *(T::*)(Graph *, Tensor *); + + const std::vector *opFunctions = &rewriteFuncs.opFunction; + const std::vector *tensFunctions = &rewriteFuncs.tensorFunction; + + // TODO: MLBEDSW-9057: Check when specific rewrite functions are added + struct Entry + { + bool done; + std::shared_ptr op; + }; + std::unordered_set opVisited; + std::unordered_set tensVisited; + std::stack stack; + + for ( const auto &tensor : graph->Outputs() ) + { + for ( const auto &op : tensor->Writers() ) + { + stack.push(Entry{false, op}); + } + } + + while ( !stack.empty() ) + { + Entry entry = stack.top(); + stack.pop(); + + // Currently we do not do anything with entry.done elements. 
+ if ( !entry.done && opVisited.count(entry.op.get()) == 0 && !entry.op->IsDisconnected() ) + { + Operation *updatedOp = entry.op.get(); + Operation *prevOp = nullptr; + + // Process op + if ( !opFunctions->empty() ) + { + // Op have been updated, parse it again. + while ( prevOp != updatedOp ) + { + if ( prevOp != nullptr ) + { + // prevOp was removed and replaced by updatedOp; remove the stale pointer from opVisited + opVisited.erase(prevOp); + } + prevOp = updatedOp; + // Execute operator functions + for ( const auto &func : *opFunctions ) + { + updatedOp = (static_cast(this)->*(func))(graph, updatedOp); + } + } + } + opVisited.insert(updatedOp); + stack.push(Entry{true, updatedOp->shared_from_this()}); + + std::array, 2> tensors; + for ( const auto &pair : updatedOp->Outputs().pairs() ) + { + tensors[0].push_back(pair.second.tensor.get()); + } + for ( const auto &pair : updatedOp->Inputs().pairs() ) + { + tensors[1].push_back(pair.second.tensor.get()); + } + for ( auto idx = 0; idx < 2; ++idx ) + { + for ( const auto &tens : tensors[idx] ) + { + Tensor *updatedTensor = tens; + + // Process Tensor if not already visited + if ( tensVisited.count(tens) == 0 ) + { + Tensor *prevTensor = nullptr; + + if ( !tensFunctions->empty() ) + { + // Tensor have been updated, parse it again. + while ( prevTensor != updatedTensor ) + { + if ( prevTensor != nullptr ) + { + tensVisited.erase(prevTensor); + } + prevTensor = updatedTensor; + // Execute tensor functions + for ( const auto &func : *tensFunctions ) + { + updatedTensor = (static_cast(this)->*(func))(graph, updatedTensor); + } + } + } + tensVisited.insert(updatedTensor); + } + // Check Writers() for tensor, even if visited as we can bypass op updating the tensors. 
+                        const std::vector<std::shared_ptr<Operation>> *ops = &updatedTensor->Writers();
+                        for ( const auto &op : *ops )
+                        {
+                            if ( opVisited.count(op.get()) == 0 )
+                            {
+                                stack.push(Entry{false, op});
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+#if LOG_TRACE1_ON
+    Operation *VisitOperatorLog(Graph *const graph, Operation *const operation);
+    Tensor *VisitTensorLog(Graph *const graph, Tensor *const tensor);
+#endif
+    Operation *RecordOperation(Graph *const graph, Operation *const operation);
+    Operation *RecordOptimisation(Graph *const graph, Operation *const operation);
+    void RecordOptimisation(const Operation *operation, const Operation *op);
+    void PrintGraph(const Graph *graph, const std::string &label) const;
+    void PrintQuantization(const Graph *graph, const std::string &label) const;
+    virtual ~GraphOptimiser() = default;
+};
+
+} // namespace regor
diff --git a/ethosu/regor/compiler/graph_optimiser_db.hpp b/ethosu/regor/compiler/graph_optimiser_db.hpp
new file mode 100644
index 00000000..45fcc61b
--- /dev/null
+++ b/ethosu/regor/compiler/graph_optimiser_db.hpp
@@ -0,0 +1,60 @@
+//
+// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the License); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an AS IS BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// + +#pragma once + +#include "database.hpp" + +#include +#include +#include + +namespace regor +{ + +class Operation; + +/// +/// Graph optimiser database implementation +/// +class OptimiserDatabase +{ +private: + Database *_db = nullptr; + int _sourceId = 0; + int _optId = 0; + int _streamId = 0; + int _sourceTable = 0; + int _optTable = 0; + int _cmdTable = 0; + int _streamTable = 0; + std::unordered_map _source; + std::unordered_map> _optimised; + +public: + OptimiserDatabase(Database *db); + Database *Get(); + int SourceId(const void *op); + int OptimisedId(const void *op); + int SourceOp(const Operation *op, int ext_key = -1); + void AddOptimised(const void *from, const Operation *to); + void AddCommand(void *key, int stream, int cmdIndex); + int AddStream(); +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/graph_packing.cpp b/ethosu/regor/compiler/graph_packing.cpp new file mode 100644 index 00000000..fc8f884b --- /dev/null +++ b/ethosu/regor/compiler/graph_packing.cpp @@ -0,0 +1,249 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "graph_packing.hpp" + +#include "common/logging.hpp" + +#include "graph.hpp" +#include "scheduler_operation.hpp" +#include "tensor.hpp" + +#include +#include +#include +#include +#include + +namespace regor +{ + +GraphPacking::GraphPacking() +{ +} + +std::unique_ptr GraphPacking::Process(std::vector>> &npuOps, + std::vector> &ops, std::unordered_map &tensorAddressMap, const Graph *srcGraph) +{ + // Build a new graph where consecutive operations running on NPU are collapsed into a Ethos-U op + // (OpType::CustomNpuOp). CPU operations are left unchanged. The algorithm makes two passes over the scheduled + // operations, where the first pass creates new operations (Ethos-U and CPU) and connects the CPU operations, and + // the second pass connects the Ethos-U operations. + + // List of all ops in new graph in scheduled order + std::vector newOpsInScheduledOrder; + + std::shared_ptr currentOp = nullptr; + NPUOperation *currentNpuOp = nullptr; + + // Pack consecutive NPU ops into a NPUOperation + for ( auto &schedOp : ops ) + { + if ( schedOp->IsNpuOp() ) + { + if ( !currentNpuOp ) + { + // Create new Ethos-U operation for the new graph + currentOp = std::make_shared(OpType::CustomNpuOp); + + // Create new NPUOperation that collects consecutive NPU operations + auto newNpuOp = std::make_unique(); + currentNpuOp = newNpuOp.get(); + + newOpsInScheduledOrder.push_back(currentOp.get()); + + npuOps.emplace_back(currentOp.get(), std::move(newNpuOp)); + } + + // Map old scheduler operation and its sub-ops to new operation + _oldOpToNewOp[schedOp.get()] = currentOp; + for ( const auto &subOp : schedOp->SubOps() ) + { + assert(subOp->IsNpuOp()); + _oldOpToNewOp[subOp.get()] = currentOp; + } + + currentNpuOp->AddOperation(std::move(schedOp)); + } + else + { + // Create new CPU operation for the new graph + assert(schedOp->_srcKey != nullptr); + currentOp = std::make_shared(*static_cast(schedOp->_srcKey)); + currentNpuOp = nullptr; + + 
newOpsInScheduledOrder.push_back(currentOp.get()); + + // Map old scheduler operation to new CPU operation + _oldOpToNewOp[schedOp.get()] = currentOp; + + for ( const auto &[usage, schedConn] : schedOp->inputs.pairs() ) + { + const auto &schedTensor = schedConn.tensor; + + // Connect input tensor to new CPU operation + const auto oldTensor = schedTensor->srcTensor; + assert(oldTensor && "Missing source graph tensor"); + const auto newTensor = LookupNewTensor(oldTensor.get(), tensorAddressMap, schedTensor->allocatedAddress); + currentOp->ConnectInput(usage, newTensor).Set(schedConn.shape).Set(schedConn.quantization); + } + + for ( const auto &[usage, schedConn] : schedOp->outputs.pairs() ) + { + const auto &schedTensor = schedConn.tensor; + + // Connect output tensor to new CPU operation + const auto oldTensor = schedTensor->srcTensor; + assert(oldTensor && "Missing source graph tensor"); + const auto newTensor = LookupNewTensor(oldTensor.get(), tensorAddressMap, schedTensor->allocatedAddress); + currentOp->ConnectOutput(usage, newTensor).Set(schedConn.shape).Set(schedConn.quantization); + } + } + } + + currentNpuOp = nullptr; + currentOp = nullptr; + + for ( auto &item : npuOps ) + { + Operation *op = item.first; + + for ( const auto &schedOp : item.second->Operations() ) + { + for ( const auto &schedConn : schedOp->inputs ) + { + const auto &schedTensor = schedConn.tensor; + if ( schedTensor->IsConstant() ) + { + // Don't connect constant tensors - they are handled at scheduler level + continue; + } + + const bool isConsumedByUs = std::any_of(schedTensor->consumers.begin(), schedTensor->consumers.end(), + [&, op](SchedulerOperation *cons) { return _oldOpToNewOp.at(cons).get() == op; }); + const bool isProducedByUsOnly = std::all_of(schedTensor->producers.begin(), schedTensor->producers.end(), + [&, op](SchedulerOperation *prod) { return _oldOpToNewOp.at(prod).get() == op; }); + if ( isConsumedByUs && isProducedByUsOnly && !schedTensor->isGraphInput ) + { + // 
Don't connect NPU internal tensors + continue; + } + + // Connect input tensor to new Ethos-U operation, but only once + const auto &oldTensor = schedTensor->srcTensor; + assert(oldTensor && "Missing source graph tensor"); + const auto newTensor = LookupNewTensor(oldTensor.get(), tensorAddressMap, schedTensor->allocatedAddress); + if ( op->UsageOfTensor(newTensor.get()) == TensorUsage::None ) + { + const auto usage = MakeTensorUsage(TensorUsage::IFM, op->Inputs().size()); + op->ConnectInput(usage, newTensor).Set(schedConn.quantization); + } + } + + for ( const auto &schedConn : schedOp->outputs ) + { + const auto &schedTensor = schedConn.tensor; + + const bool isProducedByUs = std::any_of(schedTensor->producers.begin(), schedTensor->producers.end(), + [&](SchedulerOperation *prod) { return _oldOpToNewOp.at(prod).get() == op; }); + const bool isConsumedByUsOnly = std::all_of(schedTensor->consumers.begin(), schedTensor->consumers.end(), + [&](SchedulerOperation *cons) { return _oldOpToNewOp.at(cons).get() == op; }); + if ( isProducedByUs && isConsumedByUsOnly && !schedTensor->isGraphOutput ) + { + // Don't connect NPU internal tensors + continue; + } + + // Connect output tensor to new Ethos-U operation, but only once + const auto &oldTensor = schedTensor->srcTensor; + assert(oldTensor && "Missing source graph tensor"); + const auto newTensor = LookupNewTensor(oldTensor.get(), tensorAddressMap, schedTensor->allocatedAddress); + if ( op->UsageOfTensor(newTensor.get()) == TensorUsage::None ) + { + const auto usage = MakeTensorUsage(TensorUsage::OFM, op->Outputs().size()); + op->ConnectOutput(usage, newTensor).Set(schedConn.quantization); + } + } + } + } + + // Clear ops since they have been moved into relevant NPUOperation object + ops.clear(); + _oldOpToNewOp.clear(); + + auto graph = std::make_unique(srcGraph->Notation()); + graph->SetPassthrough(srcGraph->Passthrough()); + + // Transfer graph input tensors from old graph + for ( const auto &graphInput : 
srcGraph->Inputs() )
+    {
+        graph->AddInput(LookupNewTensor(graphInput.get()));
+    }
+
+    // Transfer graph output tensors from old graph
+    for ( const auto &graphOutput : srcGraph->Outputs() )
+    {
+        graph->AddOutput(LookupNewTensor(graphOutput.get()));
+    }
+
+    _oldTensorToNewTensor.clear();
+
+    // Save the execution order of all ops in the new graph
+    graph->SetScheduledOrder(std::move(newOpsInScheduledOrder));
+
+    return graph;
+}
+
+std::shared_ptr<Tensor> GraphPacking::LookupNewTensor(Tensor *oldTensor)
+{
+    const auto it = _oldTensorToNewTensor.find(oldTensor);
+    if ( it == _oldTensorToNewTensor.end() )
+    {
+        // This cloned tensor will be used in the new graph
+        std::shared_ptr<Tensor> newTensor = oldTensor->Clone();
+
+        _oldTensorToNewTensor[oldTensor] = newTensor;
+
+        return newTensor;
+    }
+    else
+    {
+        return it->second;
+    }
+}
+
+std::shared_ptr<Tensor> GraphPacking::LookupNewTensor(
+    Tensor *oldTensor, std::unordered_map<const Tensor *, Address> &tensorAddressMap, Address allocatedAddress)
+{
+    const auto newTensor = LookupNewTensor(oldTensor);
+
+    // This cloned tensor will use same address as the original tensor
+    tensorAddressMap[newTensor.get()] = allocatedAddress;
+
+    return newTensor;
+}
+
+std::unique_ptr<Graph> PackScheduleToGraph(std::vector<std::pair<Operation *, std::unique_ptr<NPUOperation>>> &npuOps,
+    std::vector<std::unique_ptr<SchedulerOperation>> &ops, std::unordered_map<const Tensor *, Address> &tensorAddressMap, const Graph *srcGraph)
+{
+    GraphPacking p;
+
+    return p.Process(npuOps, ops, tensorAddressMap, srcGraph);
+}
+
+} // namespace regor
diff --git a/ethosu/regor/compiler/graph_packing.hpp b/ethosu/regor/compiler/graph_packing.hpp
new file mode 100644
index 00000000..8cbb6b29
--- /dev/null
+++ b/ethosu/regor/compiler/graph_packing.hpp
@@ -0,0 +1,60 @@
+//
+// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the License); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "graph.hpp" +#include "operation.hpp" +#include "scheduler_operation.hpp" +#include "tensor.hpp" + +#include +#include +#include + +namespace regor +{ + +/// +/// Graph packing +/// +class GraphPacking +{ +public: + GraphPacking(); + +public: + std::unique_ptr Process(std::vector>> &npuOps, + std::vector> &ops, + std::unordered_map &tensorAddressMap, const Graph *srcGraph); + +private: + std::unordered_map> _oldOpToNewOp; + std::unordered_map> _oldTensorToNewTensor; + + std::shared_ptr LookupNewTensor(Tensor *oldTensor); + std::shared_ptr LookupNewTensor( + Tensor *oldTensor, std::unordered_map &tensorAddressMap, Address allocatedAddress); +}; + +// Pack list of scheduler operations into one or more graphs +std::unique_ptr PackScheduleToGraph(std::vector>> &npuOps, + std::vector> &ops, + std::unordered_map &tensorAddressMap, const Graph *srcGraph); + +} // namespace regor diff --git a/ethosu/regor/compiler/graph_validator.cpp b/ethosu/regor/compiler/graph_validator.cpp new file mode 100644 index 00000000..3ca341e7 --- /dev/null +++ b/ethosu/regor/compiler/graph_validator.cpp @@ -0,0 +1,54 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an AS IS BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#include "graph_validator.hpp"
+
+#include "tosa_graph_validator.hpp"
+
+namespace regor
+{
+
+std::unique_ptr<GraphValidator> GraphValidator::MakeGraphValidator(GraphNotation notation, uint32_t syntaxVersion, Compiler *compiler)
+{
+    if ( notation == GraphNotation::GraphAPI )
+    {
+        if ( TosaGraphValidator::HandlesSyntax(syntaxVersion) )
+        {
+            return std::make_unique<TosaGraphValidator>(notation, syntaxVersion, compiler);
+        }
+    }
+    return std::make_unique<GraphValidator>(notation, syntaxVersion);
+}
+
+bool GraphValidator::Validate(Graph *)
+{
+    _validationErrors.emplace_back(Error{OpType::None, "Unsupported graph Notation/SyntaxVersion"});
+    return false;
+}
+
+std::string GraphValidator::GetErrorMsg()
+{
+    std::string errorMsg = "Validation error:\n";
+    for ( auto &error : _validationErrors )
+    {
+        errorMsg += OpTypeToString(error.operation) + error.errorMessage + "\n";
+    }
+    return errorMsg;
+}
+
+} // namespace regor
diff --git a/ethosu/regor/compiler/graph_validator.hpp b/ethosu/regor/compiler/graph_validator.hpp
new file mode 100644
index 00000000..09011075
--- /dev/null
+++ b/ethosu/regor/compiler/graph_validator.hpp
@@ -0,0 +1,59 @@
+//
+// SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the License); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an AS IS BASIS, WITHOUT
+// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+
+#pragma once
+
+#include "graph.hpp"
+
+#include
+#include
+
+#include "include/regor.h"
+
+namespace regor
+{
+
+class Compiler;
+
+class GraphValidator
+{
+public:
+    struct Error
+    {
+        OpType operation;
+        std::string errorMessage;
+    };
+
+protected:
+    GraphNotation _notation;
+    uint32_t _syntaxVersion;
+    std::vector<Error> _validationErrors;
+
+public:
+    GraphValidator(GraphNotation notation, uint32_t syntaxVersion) : _notation(notation), _syntaxVersion(syntaxVersion)
+    {
+    }
+    virtual ~GraphValidator() = default;
+
+    static std::unique_ptr<GraphValidator> MakeGraphValidator(GraphNotation notation, uint32_t syntaxVersion, Compiler *compiler);
+    virtual bool Validate(Graph *graph);
+    std::vector<Error> &GetErrors() { return _validationErrors; }
+    std::string GetErrorMsg();
+};
+
+} // namespace regor
diff --git a/ethosu/regor/compiler/graphir_optimiser.cpp b/ethosu/regor/compiler/graphir_optimiser.cpp
new file mode 100644
index 00000000..1dbc99e8
--- /dev/null
+++ b/ethosu/regor/compiler/graphir_optimiser.cpp
@@ -0,0 +1,187 @@
+//
+// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates
+//
+// SPDX-License-Identifier: Apache-2.0
+//
+// Licensed under the Apache License, Version 2.0 (the License); you may
+// not use this file except in compliance with the License.
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "compiler/graphir_optimiser.hpp" + +#include "optimiser_utils.hpp" + +namespace regor +{ + +using namespace GraphOptimisation; +Tensor *GraphIrOptimiser::ConvertInt48Tensors(Graph *, Tensor *tensor) +{ + if ( tensor->Type() == DataType::Int48 && !tensor->IsConstant() ) + { + tensor->ChangeType(DataType::Int64); + } + else if ( tensor->Type() == DataType::UInt48 && !tensor->IsConstant() ) + { + tensor->ChangeType(DataType::UInt64); + } + return tensor; +} + + +Operation *GraphIrOptimiser::ConvertAttributes(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + OpType opType = operation->Type(); + if ( opType == OpType::Asr ) + { + auto roundMode = operation->attr.asr.round ? RoundMode::NATURAL : RoundMode::TRUNCATE_TO_LOWER; + operation->SetRounding(roundMode); + } + return operation; +} + +Operation *GraphIrOptimiser::ConvertResizeOffsets(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + // Reduce positive offset parameters that are larger than scale_n + // If offset >= scale_n, we can create an ifm-slice to start on offset/scale_n. + // The offset parameters are updated to the remainder of the fraction. 
+ Operation *returnOp = operation; + OpType opType = operation->Type(); + if ( opType == OpType::Resize ) + { + auto &attr = operation->attr; + TensorConnection *ifmConn = operation->Input(TensorUsage::IFM); + Shape ifmStart = ifmConn->shape.WithZeros(); + Shape ifmShape = ifmConn->shape; + int offset_h = attr.resize.offsetYX[0]; + int offset_w = attr.resize.offsetYX[1]; + int scale_nh = attr.resize.scaleY.n; + int scale_nw = attr.resize.scaleX.n; + if ( offset_h >= scale_nh ) + { + ifmStart[1] += offset_h / scale_nh; + ifmShape[1] -= ifmStart[1]; + attr.resize.offsetYX[0] = offset_h % scale_nh; + } + if ( offset_w >= scale_nw ) + { + ifmStart[2] += offset_w / scale_nw; + ifmShape[2] -= ifmStart[2]; + attr.resize.offsetYX[1] = offset_w % scale_nw; + } + TensorSlice slice{std::move(ifmStart), std::move(ifmShape)}; + ifmConn->Set(slice); + } + return returnOp; +} + +Operation *GraphIrOptimiser::RemoveReshape(Graph *const graph, Operation *const operation) +{ + Operation *returnOp = operation; + OpType opType = operation->Type(); + + if ( IsReshape(opType) ) + { + auto *ifmConn = operation->Input(TensorUsage::IFM0); + auto *ofmConn = operation->Output(TensorUsage::OFM); + auto *ifm = ifmConn->tensor.get(); + auto *ofm = ofmConn->tensor.get(); + + // Check if ifm/ofm are network ifm/ofm + bool isIfmSgIfm = IsTensorInVector(graph->Inputs(), ifm); + bool isOfmSgOfm = IsTensorInVector(graph->Outputs(), ofm); + bool isIfmSgOfm = IsTensorInVector(graph->Outputs(), ifm); + + // TODO: MLBEDSW-9069: Check CPU operator producer/consumer + + // Inserts a copy op if needed before removing reshapes. + if ( (isIfmSgIfm || isIfmSgOfm) && (isOfmSgOfm) ) + { + auto copyOp = InsertCopyOpAfterTensor(ifmConn->tensor, ifmConn->quantization); + // reset the ifm to reflect the reshape's new ifm + ifmConn = operation->Input(TensorUsage::IFM0); + ifm = ifmConn->tensor.get(); + returnOp = copyOp.get(); + RecordOptimisation(operation, returnOp); + // Reshape still needs to be removed. 
+ } + + // Remove the reshape and one of the tensors. + if ( isOfmSgOfm ) + { + // TODO: This path should also be used for ofm tensors consumed by CPU ops. + + // The OFM is in graph outputs, do not remove this tensor. + // Bypass by replacing ifm with ofm. + // Set OFM as output for IFM producers + ReplaceProducerOutput(ifm->Writers(), ifm, ofmConn->tensor); + + // Set OFM as input to other IFM consumers. + ReplaceConsumerInput(operation, ifm->Readers(), ifm, ofmConn->tensor); + } + else + { + // Bypass by replacing ofm with ifm. + // Set IFM as input to OFM consumers. + ReplaceConsumerInput(nullptr, ofm->Readers(), ofm, ifmConn->tensor); + } + // Remove the reshape from ifm readers and ofm writers. + // Note the Inputs/Outputs on operation should still be intact to not break the traversal. + ifm->RemoveReader(operation->shared_from_this()); + ofm->RemoveWriter(operation->shared_from_this()); + } + + return returnOp; +} + +Operation *GraphIrOptimiser::RewriteFullyConnected(Graph *const graph, Operation *const operation) +{ + Operation *returnOp = operation; + OpType opType = operation->Type(); + if ( opType == OpType::FullyConnected ) + { + const auto &weights = operation->Input(TensorUsage::Weights); + const auto &shape = weights->tensor->StorageShape(); + if ( weights->tensor->AxisOrder() == AxisOrder::OI && shape.Size() == 2 ) + { + // Reshape weight tensor from (num_outputs, ..., num_inputs) to (num_outputs, 1, 1, num_inputs) + weights->tensor->SetAxisOrder(AxisOrder::OHWI); + weights->tensor->Reshape(Shape(shape[0], 1, 1, shape[-1])); + } + assert(weights->tensor->AxisOrder() == AxisOrder::OHWI); + } + + return returnOp; +} + +GraphIrOptimiser::GraphIrOptimiser(Architecture *arch, const GraphOptimiserOptions &options, OptimiserDatabase *db) : + GraphOptimiser(arch, options, db) +{ +} + +void GraphIrOptimiser::OptimiseGraph(Graph *graph) +{ + for ( auto iOpt = GraphOptimisationSteps().begin(); iOpt != GraphOptimisationSteps().end(); ++iOpt ) + { + 
LOG_TRACE1("GraphOptimiser {0}/{1}\n", std::distance(GraphOptimisationSteps().begin(), iOpt) + 1, + GraphOptimisationSteps().size()); + // Check if function lists are empty. Do not call for step that only contain disabled debug functions. + if ( !iOpt->opFunction.empty() || !iOpt->tensorFunction.empty() ) + { + RewriteGraph(graph, *iOpt); + } + } +} + +} // namespace regor diff --git a/ethosu/regor/compiler/graphir_optimiser.hpp b/ethosu/regor/compiler/graphir_optimiser.hpp new file mode 100644 index 00000000..8dfd2899 --- /dev/null +++ b/ethosu/regor/compiler/graphir_optimiser.hpp @@ -0,0 +1,107 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "common/logging.hpp" + +#include "graph.hpp" +#include "graph_optimiser.hpp" +#include "operation.hpp" +#include "tensor.hpp" + +#include + +namespace regor +{ + +/// +/// GraphIR Graph optimiser +/// +class GraphIrOptimiser : public GraphOptimiser +{ + using OpRewriteFunction = Operation *(GraphIrOptimiser::*)(Graph *, Operation *); + using TensorRewriteFunction = Tensor *(GraphIrOptimiser::*)(Graph *, Tensor *); + using GraphOptStepArray = std::vector>; + +private: + Operation *RemoveReshape(Graph *const graph, Operation *const operation); + Operation *ConvertAttributes(Graph *const graph, Operation *const operation); + Operation *ConvertResizeOffsets(Graph *const graph, Operation *const operation); + Tensor *ConvertInt48Tensors(Graph *graph, Tensor *tensor); + Operation *RewriteFullyConnected(Graph *const graph, Operation *const operation); + +public: + // The graph optimisation steps. + // Order matters, array of rewrites processed in order. + // clang-format off + const GraphOptStepArray _graphOptimisationSteps = + {{ + { + { +#if LOG_TRACE1_ON + &GraphOptimiser::VisitTensorLog +#endif + }, + { +#if LOG_TRACE1_ON + &GraphOptimiser::VisitOperatorLog, +#endif + &GraphOptimiser::RecordOperation + } + }, + { + { + &GraphIrOptimiser::ConvertInt48Tensors, + }, + { + &GraphIrOptimiser::RemoveReshape, + } + }, + { + {}, + { + &GraphIrOptimiser::ConvertAttributes, + &GraphIrOptimiser::ConvertResizeOffsets, + &GraphIrOptimiser::RewriteFullyConnected, + } + }, + { + { +#if LOG_TRACE1_ON + &GraphOptimiser::VisitTensorLog +#endif + }, + { +#if LOG_TRACE1_ON + &GraphOptimiser::VisitOperatorLog, +#endif + &GraphOptimiser::RecordOptimisation + } + } + }}; + // clang-format on + + explicit GraphIrOptimiser(Architecture *arch, const GraphOptimiserOptions &options, OptimiserDatabase *db); + + const GraphOptStepArray &GraphOptimisationSteps() const { return _graphOptimisationSteps; } + + void OptimiseGraph(Graph *graph); +}; + +} // namespace regor 
diff --git a/ethosu/regor/compiler/high_level_command_stream.hpp b/ethosu/regor/compiler/high_level_command_stream.hpp new file mode 100644 index 00000000..86edcaf1 --- /dev/null +++ b/ethosu/regor/compiler/high_level_command_stream.hpp @@ -0,0 +1,273 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "architecture/architecture.hpp" +#include "architecture/weight_encoder.hpp" +#include "common/box.hpp" +#include "common/data_type.hpp" +#include "common/reverse_type.hpp" +#include "common/shape.hpp" +#include "common/transpose_type.hpp" +#include "kernel.hpp" +#include "scheduler_operation.hpp" + +#include +#include + +namespace regor +{ + +enum class HLCRoundMode : uint8_t +{ + DBL = 0, + TRUNCATE = 1, + NATURAL = 2, + TRUNCATE_TO_LOWER = 3, + DOUBLE_ASYMMETRIC = 4, + SYMMETRIC = 5, + AUTO = 0xff +}; + +struct HLCPadding +{ + int top = 0; + int left = 0; + int bottom = 0; + int right = 0; + + std::string ToString() const + { + return fmt::format("[top:{},left:{},bottom:{},right:{}]", top, left, bottom, right); + } +}; + +/// +/// IFM/OFM information needed to generate register commands +/// +struct HLCFeatureMap +{ + TensorFormat format = TensorFormat::Unknown; + MemArea memArea; + Shape shape; + Shape strides; + Point2i stepXY = {1, 1}; + DataType dataType; + Address address = 
-1; + BufferView bufferView; + Quantization quantization; + ArchResampling resamplingMode = ArchResampling::None; + TransposeType transpose = TransposeType::None; + ReverseType reverse = ReverseType::None; + + int AllocationSizeBytes() const { return TensorAllocationBytes(shape, format, dataType); } + + std::string ToString() const + { + return fmt::format("[{}], format: {}, {}:{}, address: {}", shape.ToString(), format, memArea.memory->Name(), + memArea.usage.ToString(), address); + } +}; + +/// +/// Information about encoded weights +/// +struct HLCWeights +{ + MemArea memArea; + Buffering buffering; + Flags format; + Address address = -1; + int maxRangeBytes; // When double buffering: size of a single buffer + int subStreams = 1; + std::unordered_map encodedRanges; + + std::string ToString() const + { + return fmt::format("{} ranges, buffering: {}, {}:{}, address: {}, format: {}", encodedRanges.size(), + int(buffering), memArea.memory->Name(), memArea.usage, address, format.ToString()); + } +}; + +// Parameters that apply only to particular sub operation +union HLCParameters +{ + // Alpha value for LeakyReLU + struct + { + float alpha; + } leaky_relu; + + // Location of the source LUT, to be DMAed to LUT memory + struct LUT + { + MemArea memArea; + Address address; + int sizeBytes; + DataType ifmType; + } lut; + + struct + { + GraphApi::FractionND scaleY; + GraphApi::FractionND scaleX; + int offsetY; + int offsetX; + ArchResizeMode mode; + } resize; + + struct + { + int axis; + } argmax; +}; + +/// +/// Sub operation +/// +struct HLCSubOperation +{ + OpType type = OpType::None; + HLCParameters parameters = {}; +}; + +/// +/// Contains information needed to generate register commands for an NPU operation. +/// +/// There is one HLCOperation for every SchedulerOperation. 
Each HLCOperation can be +/// associated with one (= non-cascaded) or more (= cascaded) HLCStripes +/// +struct HLCOperation +{ + OpType type; + Kernel kernel; + std::vector ifm; + HLCFeatureMap ofm; + std::unique_ptr weights; + std::unique_ptr scales; + std::vector subOps; + HLCParameters parameters = {}; + HLCRoundMode rounding; + ArchitectureOpConfig *config = nullptr; + void *_srcKey = nullptr; + +#ifndef NDEBUG + std::string name; // name of OFM +#endif + + std::string ToString() const + { + std::string k = kernel.Size().x == 0 ? "" : kernel.ToString(); +#ifdef NDEBUG + std::string name = ""; +#endif + std::string subOpStr = subOps.empty() ? " -" : ""; + for ( auto &subOp : subOps ) + { + subOpStr += " " + OpTypeToString(subOp.type); + } + return fmt::format("{} {}, subOps:{}, {} {}", OpTypeToString(type), name, subOpStr, k, config ? config->ToString(false) : ""); + } +}; + +class HighLevelCommand +{ +public: + virtual ~HighLevelCommand() = default; + virtual bool IsStripe() const { return false; } + virtual std::string ToString() const = 0; +}; + +/// +/// High level command that performs part of or whole NPU operation, +/// depending on the box settings. 
+/// +class HLCStripe : public HighLevelCommand +{ +public: + std::shared_ptr operation; + std::vector ifmAreas; + Box ofmArea; + int weightRangeDepth = 0; // Identifies depth slice + HLCPadding padding; + +public: + HLCStripe(const std::shared_ptr &operation_) : operation(operation_) {} + bool IsStripe() const override { return true; } + + std::string ToString() const override + { + std::string ofm = ""; +#ifndef NDEBUG + ofm = " -> " + operation->name; +#endif + std::string extra = ""; + if ( ifmAreas.size() > 1 ) + { + extra = fmt::format(", IFM2 {}", ifmAreas[1].ToString()); + } + else if ( operation->weights != nullptr && operation->weights->buffering != Buffering::None ) + { + extra = fmt::format(", Weight depth: {}", weightRangeDepth); + } + if ( padding.top != 0 || padding.bottom != 0 ) + { + extra += fmt::format(", padding: {}", padding.ToString()); + } + if ( ofmArea.SizeShape().Elements() != operation->ofm.shape.Elements() ) + { + extra += (ofmArea.SizeShape().ElementsWH() == operation->ofm.shape.ElementsWH()) ? ", buffered" : ", cascaded"; + } + return fmt::format("{}{} OFM area {}, IFM {}{}", OpTypeToString(operation->type), ofm, ofmArea.ToString(), + ifmAreas[0].ToString(), extra); + } +}; + +/// +/// High level command that performs a DMA operation. 
+/// +class HLCDMA : public HighLevelCommand +{ +public: + MemArea srcMemArea; + Address srcAddress; + Shape srcStrides; // Only valid for Ethos U85 + bool srcIndexed; // Only valid for Ethos U85 + MemArea destMemArea; + Address destAddress; + Shape destStrides; // Only valid for Ethos U85 + bool destIndexed; // Only valid for Ethos U85 + MemArea idxMemArea; // Only valid for Ethos U85 + Address idxAddress; // Only valid for Ethos U85 + int idxSkip1; // Only valid for Ethos U85 + int idxMax; // Only valid for Ethos U85 + int length; + Shape sizes; // Only valid for Ethos U85 + + std::string ToString() const override + { + return fmt::format("DMA src: {}:{}, address: {}, dest: {}:{}, address: {}, size: {}", srcMemArea.memory->Name(), + srcMemArea.usage, srcAddress, destMemArea.memory->Name(), destMemArea.usage, destAddress, + sizes ? sizes.ToString() : std::to_string(length)); + } +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/high_level_command_stream_generator.cpp b/ethosu/regor/compiler/high_level_command_stream_generator.cpp new file mode 100644 index 00000000..ec714708 --- /dev/null +++ b/ethosu/regor/compiler/high_level_command_stream_generator.cpp @@ -0,0 +1,813 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "high_level_command_stream_generator.hpp" + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "common/box.hpp" +#include "common/numeric_util.hpp" +#include "common/vector_span.hpp" +#include "high_level_command_stream.hpp" +#include "scheduler.hpp" + +#include +#include + +namespace regor +{ + + +static void CalcPaddingAndSkirt(const Kernel *kernel, const Shape &inputShape, const Shape &outputShape, HLCPadding &padding, HLCPadding &skirt) +{ + auto dilatedWH = kernel->DilatedWH(); + int ypad = NeededTotalPadding(inputShape.Height(), outputShape.Height(), kernel->Stride().y, dilatedWH.y); + int xpad = NeededTotalPadding(inputShape.Width(), outputShape.Width(), kernel->Stride().x, dilatedWH.x); + const auto &pad = kernel->Padding(); + padding.left = pad.Left(); + padding.right = pad.Right(); + padding.top = pad.Top(); + padding.bottom = pad.Bottom(); + skirt.top = padding.top; + skirt.left = padding.left; + skirt.bottom = std::max(ypad - padding.top, dilatedWH.y - 1); + skirt.right = std::max(xpad - padding.left, dilatedWH.x - 1); +} + +enum class TransformLimit +{ + None, + Wrap, +}; + +static Box TransformWithStridesAndSkirt(const Box &outputArea, const Shape *strides, const HLCPadding *skirt, const Shape &ifmShape, + OpType opType, const Shape &concatOffsets, const Shape &splitOffset, const Shape &splitShape, int dilatedKernelHeight, int upscalingFactor, + int &padTop, int &padBottom, TransformLimit limit = TransformLimit::None, TransposeType transposeType = TransposeType::None) +{ + Shape outputAreaStart = outputArea.Start().Untranspose(transposeType); + Shape outputAreaEnd = outputArea.End().Untranspose(transposeType); + Shape concatOffsetsUntransposed = concatOffsets.Untranspose(transposeType); + Shape outputAreaSize = outputAreaEnd - outputAreaStart; + // Make start/end at least 4 dimensional + Shape start = Shape::Max(outputAreaStart - concatOffsetsUntransposed, Shape(0, 0, 0, 0)); + Shape end = Shape::Max(start + 
outputAreaSize, Shape(1, 1, 1, 1)); + start += splitOffset; + end += splitOffset; + if ( (IsConvolution(opType) && !IsDepthwise(opType)) ) + { + if ( splitOffset.Size() == 0 ) + { + start = start.WithDepth(0); + end = end.WithDepth(ifmShape.Depth()); + } + else + { + start = start.WithDepth(splitOffset.Depth()); + end = end.WithDepth(start.Depth() + splitShape.Depth()); + } + } + else if ( IsVectorProduct(opType) || opType == OpType::ReduceSum ) + { + // these types of operations do a "dot product" or sum over the entire IFM - full shape needed + if ( splitOffset.Size() == 0 ) + { + start = Shape(0, 0, 0, 0); + end = Shape::PadAxes(ifmShape, 4, 1); + } + else + { + start = splitOffset; + end = start + splitShape; + } + } + else if ( opType == OpType::Resize ) + { + // TODO MLBEDSW-8660: Striping of resize operations + return (splitOffset.Size() > 0) ? Box(splitOffset, splitOffset + splitShape) : Box(Shape::PadAxes(ifmShape, 4, 1)); + } + else if ( IsBinaryElementwise(opType) && splitOffset.Size() != 0 ) + { + // Elementwise with splitShape. IFM might not cover full OFM and can be broadcasted in that case + start = splitOffset; + end = start + splitShape; + } + end = Shape::Min(end, Shape::Max(ifmShape, Shape(1, 1, 1, 1)).WithHW(ifmShape.Height() * upscalingFactor, ifmShape.Width() * upscalingFactor)); + padTop = 0; + padBottom = 0; + + assert(strides != nullptr && skirt != nullptr); + assert(strides->Size() == 4); + + int strideW = strides->Width(); + int validIfmOffset = splitOffset.IsEmpty() ? 
0 : splitOffset.Width(); + start = start.WithWidth(std::max(start.Width() * strideW - skirt->left, validIfmOffset)); + int validIfmWidth = ifmShape.Width(); + if ( splitShape.Size() > 1 && splitOffset.Size() > 1 ) + validIfmWidth = std::min(validIfmWidth, splitShape.Width() + splitOffset.Width()); + else if ( splitShape.Size() > 1 ) validIfmWidth = splitShape.Width(); + end = end.WithWidth(std::min(end.Width() * strideW + skirt->right, validIfmWidth)); + int strideH = strides->Height(); + int skirtTopRemainder = skirt->top % upscalingFactor; + int totalStride = strideH * (outputAreaEnd.Height() - outputAreaStart.Height() - 1); + int startHeight = start.Height() * strideH - skirt->top + skirtTopRemainder; + padTop = std::max(0, -startHeight) + skirtTopRemainder; + start = start.WithHeight(std::max(startHeight, 0)); + if ( end.Height() * strideH + skirt->bottom > ifmShape.Height() * upscalingFactor ) + { + // padBottom is calculated based the diff between the end position of the weight kernel, + // after last stride and the ifm height. + if ( upscalingFactor != 1 && outputAreaEnd.Height() > ifmShape.Height() * upscalingFactor ) + { + // Special case for Transpose Convolution with VALID padding. 
+ padBottom = outputAreaEnd.Height() - ifmShape.Height() * upscalingFactor; + } + else + { + int kernelStart = start.Height() - padTop; + padBottom = std::max(0, kernelStart + totalStride + dilatedKernelHeight - ifmShape.Height() * upscalingFactor); + } + } + // Adjust for upscaling + start = start.WithHeight(std::max(start.Height() / upscalingFactor, 0)); + int endHeight = end.Height() * strideH + skirt->bottom + skirt->bottom % upscalingFactor; + end = end.WithHeight(std::min(std::max(endHeight / upscalingFactor, 1), ifmShape.Height())); + + if ( limit == TransformLimit::Wrap ) + { + Shape ifmWrap = Shape::PadAxes(ifmShape, 4, 1); + Shape one(1, 1, 1, 1); + start = Shape::Wrap(start, ifmWrap); + end = Shape::Wrap(end - one, ifmWrap) + one; + assert((end - start).Elements() > 0); + } + + return Box(start, end); +} + +static int StrideAdjustedPadding(int pad, int outOffset, int kernelStride) +{ + return pad - outOffset * (kernelStride - 1); +} + +static int MarginForKernel(int paddedInput, int dilatedSize, int stride) +{ + assert(stride > 0); + int margin = paddedInput % stride; + if ( margin == 0 ) + { + margin += stride; + } + return std::max(0, dilatedSize - margin); +} + +static std::pair TransformWithInputOutputSteps(const Box &inputArea, const Point2i &inputStep, + const Box &outputArea, const Point2i &outputStep, class Kernel *kernel, const HLCPadding &padding) +{ + const auto &stride = kernel->Stride(); + const auto dilatedWH = kernel->DilatedWH(); + HLCPadding newPadding; + auto adjustedTopPad = StrideAdjustedPadding(padding.top, outputArea.Start().Height(), stride.y); + auto adjustedLeftPad = StrideAdjustedPadding(padding.left, outputArea.Start().Width(), stride.x); + newPadding.top = std::max(0, DivRoundUp(adjustedTopPad, inputStep.y)); + newPadding.left = std::max(0, DivRoundUp(adjustedLeftPad, inputStep.x)); + Point2i startAdjustForPadFraction; + startAdjustForPadFraction.x = newPadding.left * inputStep.x - adjustedLeftPad; + 
startAdjustForPadFraction.y = newPadding.top * inputStep.y - adjustedTopPad; + Point2i neededInput; + neededInput.x = DivRoundUp(outputArea.End().Width() - outputArea.Start().Width(), outputStep.x) * stride.x; + neededInput.x += MarginForKernel(neededInput.x, dilatedWH.x, stride.x); + neededInput.y = DivRoundUp(outputArea.End().Height() - outputArea.Start().Height(), outputStep.y) * stride.y; + neededInput.y += MarginForKernel(neededInput.y, dilatedWH.y, stride.y); + Shape newStart = + inputArea.Start() + .WithWidth(inputArea.Start().Width() + startAdjustForPadFraction.x) + .WithHeight(inputArea.Start().Height() + startAdjustForPadFraction.y); + newPadding.bottom = std::max(0, + neededInput.y - + (DivRoundUp((inputArea.End().Height() - inputArea.Start().Height()) - startAdjustForPadFraction.y, inputStep.y) + + newPadding.top)); + newPadding.right = std::max(0, + neededInput.x - + (DivRoundUp(inputArea.End().Width() - inputArea.Start().Width() - startAdjustForPadFraction.x, inputStep.x) + + newPadding.left)); + return std::make_pair(Box(newStart, inputArea.End()), std::move(newPadding)); +} + +// Calculates STRIDE_C/Y/X +static Shape GetStrides(const HLCFeatureMap &fm) +{ + auto elemSize = DataTypeSizeBits(fm.dataType) / 8; + if ( fm.format == TensorFormat::NHWC ) + { + int strideC = elemSize; + int strideX = fm.shape.Depth() * strideC; + int strideY = fm.shape.Width() * strideX; + int strideN = fm.shape.Height() * strideY; + return Shape(strideN, strideY, strideX, strideC); + } + else if ( fm.format == TensorFormat::NHCWB16 ) + { + int strideX = 16 * elemSize; + int strideC = strideX * fm.shape.Width(); + int strideY = elemSize * fm.shape.Width() * RoundAway(fm.shape.Depth(), 16); + int strideN = fm.shape.Height() * strideY; + return Shape(strideN, strideY, strideX, strideC); + } + else + { + assert(false && "Unsupported tensor format"); + return Shape(0, 0, 0, 0); + } +} + +static void MakeFeatureMap(const SchedulerConnection *schedConn, HLCFeatureMap &fm) +{ + 
auto schedTens = schedConn->tensor.get(); + fm.shape = schedConn->shape; + fm.dataType = schedTens->dataType; + fm.memArea = schedTens->memArea; + fm.format = schedTens->format; + fm.address = schedTens->allocatedAddress; + fm.quantization = schedConn->quantization; + fm.bufferView = schedTens->bufferView; + fm.strides = GetStrides(fm); + fm.stepXY = schedConn->stepXY; + fm.transpose = schedConn->transpose; + fm.reverse = schedConn->reverse; + fm.resamplingMode = schedConn->resamplingMode; +} + +static std::unique_ptr MakeWeights(NpuWeightTensor *srcTensor, Buffering buffering, SchedulerTensor *bufTensor = nullptr) +{ + auto weights = std::make_unique(); + if ( buffering == Buffering::None ) + { + assert(!bufTensor); + } + if ( bufTensor == nullptr ) + { + bufTensor = srcTensor; + } + weights->address = bufTensor->allocatedAddress; + weights->memArea = bufTensor->memArea; + weights->buffering = buffering; + // Same function is used for generating scales - scales have no config or weight format, so set to default + weights->format = srcTensor->config ? 
srcTensor->config->Format() : Flags(WeightFormat::Default); + weights->maxRangeBytes = srcTensor->maxRangeBytes; + weights->subStreams = srcTensor->subStreams; + weights->encodedRanges = srcTensor->encodedRanges; + return weights; +} + +static std::shared_ptr MakeOperation(SchedulerOperation *schedOp, SchedulerOpInfo *opInfo) +{ + auto op = std::make_shared(); + op->type = schedOp->Type(); + op->kernel = *schedOp->Kernel(); + op->config = opInfo->Config(); + op->rounding = HLCRoundMode(schedOp->Rounding()); + op->_srcKey = schedOp->_srcKey; + for ( int i = 0; i < MAX_NUM_IFM; ++i ) + { + auto ifm = schedOp->TryIFM(i); + if ( ifm != nullptr ) + { + HLCFeatureMap fm; + MakeFeatureMap(ifm, fm); + op->ifm.push_back(fm); + } + } + MakeFeatureMap(schedOp->OFM(), op->ofm); +#ifndef NDEBUG + op->name = schedOp->OFM()->tensor->Name(); +#endif + if ( opInfo->npuWeightsTensor != nullptr ) + { + assert(schedOp->TryInput(TensorUsage::Weights) != nullptr); + op->weights = MakeWeights(opInfo->npuWeightsTensor.get(), opInfo->bufferedWeightTensor.buffering, + opInfo->bufferedWeightTensor.tensor.get()); + } + + if ( opInfo->npuScalesTensor != nullptr ) + { + // Only scales encoded + op->scales = MakeWeights(opInfo->npuScalesTensor.get(), Buffering::None); + } + else if ( schedOp->TryInput(TensorUsage::Scales) != nullptr ) + { + // Weights and scales encoded together + assert(!!opInfo->npuWeightsTensor); + op->scales = MakeWeights(opInfo->npuWeightsTensor.get(), opInfo->bufferedWeightTensor.buffering, + opInfo->bufferedWeightTensor.tensor.get()); + } + + auto lutConn = schedOp->TryInput(TensorUsage::LUT); + if ( lutConn != nullptr ) + { + // Add sub op for operations using LUT + HLCSubOperation lutSubOp; + lutSubOp.type = OpType::LUT; + auto ¶m = lutSubOp.parameters.lut; + auto lut = lutConn->tensor; + auto end = schedOp->SubOps().end(); + auto subOp = std::find_if(schedOp->SubOps().begin(), end, [](auto &so) { return so->Type() == OpType::LUT; }); + // Register command stream 
generator will allocate the LUT + // in LUT memory and generate DMA for the LUT; for this + // it must know the location of the tensor in read-only memory + param.memArea = lut->memArea; + param.address = lut->allocatedAddress; + param.sizeBytes = lut->AllocationSizeBytes(); + param.ifmType = subOp != end ? (*subOp)->IFM(0)->tensor->dataType : schedOp->IFM(0)->tensor->dataType; + op->subOps.push_back(std::move(lutSubOp)); + } + for ( auto &subOp : schedOp->SubOps() ) + { + if ( subOp->Type() != OpType::LUT ) + { + HLCSubOperation hlcSubOp; + hlcSubOp.type = subOp->Type(); + // TODO: add op type specific info + if ( subOp->Type() == OpType::LeakyRelu ) + { + const auto ¶meters = subOp->Parameters(); + hlcSubOp.parameters.leaky_relu.alpha = parameters.leaky_relu.alpha; + } + op->subOps.push_back(std::move(hlcSubOp)); + } + } + const auto ¶meters = schedOp->Parameters(); + const auto &attr = schedOp->Attributes(); + const auto &ifmShape = schedOp->IFM(0)->shape; + switch ( schedOp->Type() ) + { + case OpType::LeakyRelu: + op->parameters.leaky_relu.alpha = parameters.leaky_relu.alpha; + break; + case OpType::Resize: + op->parameters.resize.scaleY = attr.resize.scaleY; + op->parameters.resize.scaleX = attr.resize.scaleX; + op->parameters.resize.offsetY = attr.resize.offsetYX[0]; + op->parameters.resize.offsetX = attr.resize.offsetYX[1]; + if ( ifmShape.Width() == 1 && ifmShape.Height() == 1 ) + { + // 1x1 IFMs can be handled with replicate + op->parameters.resize.mode = ArchResizeMode::Replicate; + } + else if ( attr.resize.mode == tosa::ResizeMode::NEAREST ) + { + op->parameters.resize.mode = ArchResizeMode::Nearest; + } + else + { + op->parameters.resize.mode = ArchResizeMode::Bilinear; + } + break; + case OpType::ArgMax: + op->parameters.argmax.axis = attr.axis.axis; + break; + default: + break; + } + return op; +} + +// Finds the next stripe command in the stream +static HLCStripe *FindNextStripe(HLCStream &cmds, int fromIndex) +{ + int sz = int(cmds.size()); + for 
( int i = fromIndex; i < sz; ++i ) + { + if ( cmds[i]->IsStripe() ) + { + return static_cast(cmds[i].get()); + } + } + assert(fromIndex != 0); // Every stream should contain at least one stripe + return nullptr; +} + +// Generates DMA command for Scatter/Gather +void HLCStreamGenerator::GenerateHLCDMACommands(SchedulerOperation *op, const std::shared_ptr &hlcOp, HLCStream &cmds) +{ + UNUSED(op); + + auto opType = hlcOp->type; + assert(opType == OpType::Scatter || opType == OpType::Gather); + + int ifmSrc = 0; + + if ( opType == OpType::Scatter ) + { + auto &ifm = hlcOp->ifm[0]; // GraphIR Scatter values_in + auto &ofm = hlcOp->ofm; // GraphIR Scatter values_out + assert(ifm.AllocationSizeBytes() == ofm.AllocationSizeBytes()); + + // Generate HLCDMA that copies values_in to values_out + auto dma = std::make_unique(); + dma->srcMemArea = ifm.memArea; + dma->srcAddress = ifm.address; + dma->srcStrides = GetStrides(ifm); + dma->destMemArea = ofm.memArea; + dma->destAddress = ofm.address; + dma->destStrides = GetStrides(ofm); + dma->length = ifm.AllocationSizeBytes(); + + cmds.push_back(std::move(dma)); + + ifmSrc = 2; + } + + auto &valFm = hlcOp->ifm[0]; // GraphIR Scatter values_in or GraphIR Gather values + auto &idxFm = hlcOp->ifm[1]; // GraphIR Scatter indicies or GraphIR Gather indices + auto &srcFm = hlcOp->ifm[ifmSrc]; // GraphIR Scatter input or GraphIR Gather values + auto &ofm = hlcOp->ofm; // GraphIR Scatter values_out or GraphIR Gather output + assert(idxFm.dataType == DataType::Int32 || idxFm.dataType == DataType::Int64); + assert(srcFm.dataType == ofm.dataType); + + // Generate HLCDMA that scatters or gathers + auto dma = std::make_unique(); + dma->srcMemArea = srcFm.memArea; + dma->srcAddress = srcFm.address; + dma->srcIndexed = (opType == OpType::Gather); + dma->idxMemArea = idxFm.memArea; + dma->idxAddress = idxFm.address; + dma->destMemArea = ofm.memArea; + dma->destAddress = ofm.address; + dma->destIndexed = (opType == OpType::Scatter); + dma->length 
= DataTypeStorageSizeBytes(srcFm.dataType, srcFm.shape[-1]); + dma->idxMax = valFm.shape[-2] - 1; + + auto srcStrides = GetStrides(srcFm); + auto destStrides = GetStrides(ofm); + + if ( opType == OpType::Scatter && idxFm.dataType == DataType::Int64 ) + { + // Do scatter in 3D mode with index skip because HW can only use int32 indicies + dma->srcStrides = Shape(srcStrides[-2], 0, srcStrides[-1]); + dma->destStrides = Shape(0, destStrides[-2], destStrides[-1]); + dma->sizes = idxFm.shape.Extract({-1, -2}); + dma->idxSkip1 = 4; + } + else if ( opType == OpType::Gather && idxFm.dataType == DataType::Int64 ) + { + // Do gather in 3D mode with index skip because HW can only use int32 indicies + dma->srcStrides = Shape(0, srcStrides[-2], srcStrides[-1]); + dma->destStrides = Shape(destStrides[-2], 0, destStrides[-1]); + dma->sizes = idxFm.shape.Extract({-1, -2}); + dma->idxSkip1 = 4; + } + else + { + // Do scatter or gather in 2D mode + dma->destStrides = std::move(destStrides); + dma->srcStrides = std::move(srcStrides); + dma->sizes = idxFm.shape.Extract({-2, -1}); + dma->idxSkip1 = 0; + } + + cmds.push_back(std::move(dma)); +} + +// Generates DMA command for weights +static std::unique_ptr GenerateWeightDMA(NpuWeightTensor *weightTens, const SchedulerConnection &bufConn, int depth, int depthIndex) +{ + auto dma = std::make_unique(); + dma->srcMemArea = weightTens->memArea; + dma->srcAddress = weightTens->allocatedAddress; + dma->length = 0; + int offset0 = 0; // offset of the first substream + for ( int subStream = 0; subStream < weightTens->subStreams; ++subStream ) + { + auto item = weightTens->encodedRanges.find(WeightKey(subStream, depth)); + if ( item == weightTens->encodedRanges.end() ) + { + assert(subStream > 0); + } + else + { + if ( subStream == 0 ) + { + offset0 = item->second.offset; + dma->srcAddress += offset0; + } + dma->length = RoundAway(item->second.offset + item->second.TotalBytes() - offset0, 16); + } + } + dma->destMemArea = bufConn.tensor->memArea; 
+ dma->destAddress = bufConn.tensor->allocatedAddress; + if ( bufConn.buffering == Buffering::Double && depthIndex % 2 == 1 ) + { + dma->destAddress += weightTens->maxRangeBytes; + } + return dma; +} + +void HLCStreamGenerator::GenerateHLCStripeCommands(SchedulerOperation *op, const std::shared_ptr &hlcOp, HLCStream &cmds) +{ + auto opInfo = _schedule->Cost(op); + HLCPadding skirt; + HLCPadding padding; + auto kernel = op->Kernel(); + assert(kernel != nullptr && "Operators must have a kernel"); + Shape strides = Shape(1, kernel->Stride().y, kernel->Stride().x, 1); + auto opType = op->Type(); + auto ofmConn = op->OFM(); + auto ifm0Conn = op->IFM(0); + const auto &ofmShape = ofmConn->SliceShape(); + const auto &ifm0Shape = ifm0Conn->SliceShape(); + + auto *ifm1Conn = op->TryIFM(1); + auto maxIfmShape = ifm0Shape; + if ( ifm1Conn && IsBinaryElementwise(opType) ) + { + // Use full ifm shape for broadcast elementwise operators + maxIfmShape = Shape::Max(ifm0Conn->SliceShape(), ifm1Conn->SliceShape()); + } + CalcPaddingAndSkirt(kernel, maxIfmShape, ofmShape, padding, skirt); + + int upscaling = 1; + if ( opType == OpType::Conv2DBackpropInputSwitchedBias ) + { + upscaling = ofmShape.Height() / ifm0Shape.Height(); + } + auto &depthSlices = opInfo->ofmDepthSlices; + int dilatedKernelHeight = kernel->DilatedWH().y; + + // Define Start and End coordinates for the OFM + auto ofmStart = Shape(0, 0, 0, depthSlices[0]); + auto ofmEnd = Shape::PadAxes(ofmShape, 4, 1); + if ( ofmConn->slice.offset.Size() > 0 ) + { + ofmStart = ofmConn->slice.offset; + ofmEnd = ofmConn->slice.offset + ofmConn->slice.shape; + } + assert(hlcOp->ifm.size() <= 2); + + // Binary elementwise using broadcast to repeat smaller IFMs over larger IFM volumes need their + // coordinates to wrap at the limits of the smaller IFM volume. + TransformLimit ifmLimit = IsBinaryElementwise(op->Type()) ? 
TransformLimit::Wrap : TransformLimit::None; + + const auto &ofmStep = opInfo->stripe; + for ( int startHeight = ofmStart.Height(); startHeight < ofmEnd.Height(); startHeight += ofmStep.Height() ) + { + int endHeight = std::min(startHeight + ofmStep.Height(), ofmEnd.Height()); + for ( int startWidth = ofmStart.Width(); startWidth < ofmEnd.Width(); startWidth += ofmStep.Width() ) + { + int endWidth = std::min(startWidth + ofmStep.Width(), ofmEnd.Width()); + for ( int depthIndex = 0; depthIndex < int(depthSlices.size()) - 1; ++depthIndex ) + { + int startChannel = std::max(depthSlices[depthIndex], ofmStart.Depth()); + int endChannel = std::min(depthSlices[depthIndex + 1], ofmEnd.Depth()); + + // Construct the output area for the current stripe + auto outputAreaStart = Shape(ofmStart.Batch(), startHeight, startWidth, startChannel); + auto outputAreaEnd = Shape(ofmEnd.Batch(), endHeight, endWidth, endChannel); + auto outputArea = Box(outputAreaStart, outputAreaEnd); + auto hlcStripe = std::make_unique(hlcOp); + hlcStripe->padding = padding; + hlcStripe->ofmArea = outputArea; + for ( unsigned ifmIndex = 0; ifmIndex < hlcOp->ifm.size(); ++ifmIndex ) + { + auto ifmConn = op->IFM(ifmIndex); + // Calculate input area based on the output area + auto inputArea = TransformWithStridesAndSkirt(outputArea, &strides, &skirt, ifmConn->shape, opType, + ofmConn->slice.offset, ifmConn->slice.offset, ifmConn->slice.shape, dilatedKernelHeight, + upscaling, hlcStripe->padding.top, hlcStripe->padding.bottom, ifmLimit, ofmConn->transpose); + if ( ofmConn->stepXY != Point2i{1, 1} || ifmConn->stepXY != Point2i{1, 1} ) + { + std::tie(inputArea, hlcStripe->padding) = TransformWithInputOutputSteps( + inputArea, ifmConn->stepXY, outputArea, ofmConn->stepXY, kernel, hlcStripe->padding); + } + hlcStripe->ifmAreas.push_back(inputArea); + } + if ( opInfo->npuWeightsTensor != nullptr ) + { + hlcStripe->weightRangeDepth = startChannel; + if ( opInfo->bufferedWeightTensor.tensor != nullptr && + 
(startHeight == ofmStart.Height() || opInfo->bufferedWeightTensor.buffering == Buffering::Double) ) + { + assert(opInfo->npuWeightsTensor->config->DepthOffsets().size() == depthSlices.size()); + // Metadata of new weights to put into the weight buffer tensor + auto newWeights = std::make_tuple(opInfo->npuWeightsTensor->equivalenceId, startChannel, depthIndex); + if ( _filledWeightBuffers.count(opInfo->bufferedWeightTensor.tensor.get()) == 0 ) + { + // There is nothing in the weights buffer tensor yet + cmds.push_back(GenerateWeightDMA(opInfo->npuWeightsTensor.get(), + opInfo->bufferedWeightTensor, startChannel, depthIndex)); + } + else + { + auto ¤tWeights = _filledWeightBuffers[opInfo->bufferedWeightTensor.tensor.get()]; + if ( currentWeights != newWeights ) + { + // There is something in the weights buffer tensor, but it's not correct + cmds.push_back(GenerateWeightDMA(opInfo->npuWeightsTensor.get(), + opInfo->bufferedWeightTensor, startChannel, depthIndex)); + } + } + _filledWeightBuffers[opInfo->bufferedWeightTensor.tensor.get()] = newWeights; + } + } + else + { + hlcStripe->weightRangeDepth = -1; + } + cmds.push_back(std::move(hlcStripe)); + } + } + } +} + +void HLCStreamGenerator::GenerateCommands(SchedulerOperation *op, const std::shared_ptr &hlcOp, HLCStream &cmds) +{ + auto opType = op->Type(); + + if ( IsDma(opType) ) + { + GenerateHLCDMACommands(op, hlcOp, cmds); + } + else + { + GenerateHLCStripeCommands(op, hlcOp, cmds); + } +} + +void HLCStreamGenerator::GenerateCommandsForCascade(vector_span> cascadedOps, + vector_span> hlcOps, const CascadeInfo *cascadeInfo, HLCStream &cmds) +{ + // High level command stream for each individual operation + std::vector cmdsForOps; + std::vector currIndex; + // Performed stripe at each operation + std::vector availableStripe; + // Next stripe to be performed at each operation + std::vector nextStripe; + int nrOps = cascadedOps.size(); + assert(cascadeInfo != nullptr); + // Apply intermediate feature map shapes to 
cascaded operations + for ( int i = 1; i < nrOps; ++i ) + { + auto item = cascadeInfo->buffers.find(*cascadedOps[i]); + if ( item == cascadeInfo->buffers.end() ) + { + assert(false); + } + else + { + auto &shape = item->second.shape; + hlcOps[i - 1]->ofm.shape = shape; + hlcOps[i]->ifm[cascadedOps[i]->PrimaryIfmIndex()].shape = shape; + } + } + // Generate high level commands for every operation in the cascade; + // keep the generated streams in separate lists + for ( int i = 0; i < nrOps; ++i ) + { + HLCStream stream; + GenerateCommands(cascadedOps[i].get(), hlcOps[i], stream); + currIndex.push_back(0); + availableStripe.push_back(nullptr); + nextStripe.push_back(FindNextStripe(stream, 0)); + cmdsForOps.push_back(std::move(stream)); + } + // Combine the generated command streams for the individual operations to a single stream. + // A command on one level can only performed when its input has been produced at the previous level. + int opIndex = 0; + while ( true ) + { + int &ix = currIndex[opIndex]; + if ( opIndex == 0 || + nextStripe[opIndex]->ifmAreas[cascadedOps[opIndex]->PrimaryIfmIndex()].End().IsSubShapeOf( + availableStripe[opIndex - 1]->ofmArea.End()) ) + { + auto &stream = cmdsForOps[opIndex]; + assert(ix < int(stream.size())); + HighLevelCommand *hlc = stream[ix].get(); + cmds.push_back(std::move(cmdsForOps[opIndex][ix])); + ++ix; + if ( hlc->IsStripe() ) + { + availableStripe[opIndex] = nextStripe[opIndex]; + nextStripe[opIndex] = FindNextStripe(stream, ix); + if ( opIndex < nrOps - 1 && + nextStripe[opIndex + 1]->ifmAreas[cascadedOps[opIndex + 1]->PrimaryIfmIndex()].End().IsSubShapeOf( + availableStripe[opIndex]->ofmArea.End()) ) + { + // Enough output has been produced to continue at next level + ++opIndex; + } + if ( nextStripe[opIndex] == nullptr ) + { + // Finished + assert(opIndex >= nrOps - 1); + break; + } + } + } + else + { + // More input is needed from the previous level + --opIndex; + } + } +} + +HLCStream 
HLCStreamGenerator::GenerateCommandStream(const NPUOperation *npuOp, const Schedule *schedule, bool verbose) +{ + HLCStream cmds; + _schedule = schedule; + auto &npuOps = npuOp->Operations(); + // Create HLCOperation for every ScheduledOperation + std::vector> hlcOps; + for ( auto &schedOp : npuOps ) + { + auto op = schedOp.get(); + hlcOps.push_back(MakeOperation(op, schedule->Cost(op))); + } + // Generate the command stream + int sz = int(npuOps.size()); + for ( int i = 0; i < sz; ++i ) + { + auto op = npuOps[i].get(); + auto opInfo = schedule->Cost(op); + assert(opInfo != nullptr); + auto &hlcOp = hlcOps[i]; + if ( opInfo->cascade == 0 ) + { + // Single operation, not in cascade + GenerateCommands(op, hlcOp, cmds); + } + else + { + // Cascaded operation: generate commands for all operations in the cascade + auto cascadeInfo = _schedule->Cascade(opInfo->cascade); + assert(cascadeInfo != nullptr); + assert(op->Index() == cascadeInfo->start); + // Note: below code assumes: + // - all operations in a cascade are in the same NPU op + // - operations in a cascade are contiguous + // - operations in the npuOp appear in same order as in the schedule + int cascadeSize = cascadeInfo->end - cascadeInfo->start + 1; + assert(i + cascadeSize <= sz); + vector_span> cascadedOps(npuOps, i, i + cascadeSize); + vector_span> cascadedHlcOps(hlcOps, i, i + cascadeSize); + GenerateCommandsForCascade(cascadedOps, cascadedHlcOps, cascadeInfo, cmds); + i += cascadeSize - 1; + } + } + if ( verbose ) + { + PrintCommandStream(npuOp, hlcOps, cmds); + } + return cmds; +} + +void HLCStreamGenerator::PrintCommandStream(const NPUOperation *npuOp, std::vector> &hlcOps, HLCStream &cmds) +{ + LOG_PRINT("High level NPU operations:\n"); + int opIndex = 0; + for ( auto &schedOp : npuOp->Operations() ) + { + auto op = schedOp.get(); + const auto hlcOp = hlcOps[opIndex].get(); + LOG_PRINT("{} {}\n", opIndex, hlcOp->ToString()); + LOG_PRINT(" IFM: {}, {}\n", op->IFM(0)->tensor->Name(), 
hlcOp->ifm[0].ToString()); + if ( hlcOp->ifm.size() > 1 ) + { + LOG_PRINT(" IFM2: {}, {}\n", op->IFM(1)->tensor->Name(), hlcOp->ifm[1].ToString()); + } + if ( hlcOp->ifm.size() > 2 ) + { + LOG_PRINT(" IFM3: {}, {}\n", op->IFM(2)->tensor->Name(), hlcOp->ifm[2].ToString()); + } + LOG_PRINT(" OFM: {}, {}\n", op->OFM()->tensor->Name(), hlcOp->ofm.ToString()); + if ( hlcOp->weights != nullptr ) + { + LOG_PRINT(" Weights: {}, {}\n", op->Input(TensorUsage::Weights)->tensor->Name(), hlcOp->weights->ToString()); + } + ++opIndex; + } + LOG_PRINT("High level command stream:\n"); + for ( unsigned i = 0; i < cmds.size(); ++i ) + { + LOG_PRINT("{} {}\n", i, cmds[i]->ToString()); + } +} + +} // namespace regor diff --git a/ethosu/regor/compiler/high_level_command_stream_generator.hpp b/ethosu/regor/compiler/high_level_command_stream_generator.hpp new file mode 100644 index 00000000..e3e127ab --- /dev/null +++ b/ethosu/regor/compiler/high_level_command_stream_generator.hpp @@ -0,0 +1,63 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "cascade_builder.hpp" +#include "common/vector_span.hpp" +#include "high_level_command_stream.hpp" +#include "scheduler.hpp" +#include "scheduler_operation.hpp" + +#include +#include +#include + +namespace regor +{ + +using HLCStream = std::vector>; + +/// +/// High level command stream generator +/// +class HLCStreamGenerator +{ +public: + // Generates high level command stream for the scheduled operations in the given NPU op + HLCStream GenerateCommandStream(const NPUOperation *npuOp, const Schedule *schedule, bool verbose); + +private: + // Generates one or more HLCStripe commands from a given operation and adds them to the stream + void GenerateHLCStripeCommands(SchedulerOperation *op, const std::shared_ptr &hlcOp, HLCStream &cmds); + // Generates one or more HLCDMA commands from a given operation and adds them to the stream + void GenerateHLCDMACommands(SchedulerOperation *op, const std::shared_ptr &hlcOp, HLCStream &cmds); + // Generates high level commands for the given operation and adds them to the command stream + void GenerateCommands(SchedulerOperation *op, const std::shared_ptr &hlcOp, HLCStream &cmds); + // Generates high level commands for all operations in the cascade and adds them to the command stream + void GenerateCommandsForCascade(vector_span> cascadedOps, + vector_span> hlcOps, const CascadeInfo *cascadeInfo, HLCStream &cmds); + void PrintCommandStream(const NPUOperation *npuOp, std::vector> &hlcOps, HLCStream &cmds); + + // Tracking what has been put in the weight buffers + std::unordered_map> _filledWeightBuffers; + + const Schedule *_schedule = nullptr; +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/hillclimb_allocator.cpp b/ethosu/regor/compiler/hillclimb_allocator.cpp new file mode 100644 index 00000000..d69671d0 --- /dev/null +++ b/ethosu/regor/compiler/hillclimb_allocator.cpp @@ -0,0 +1,355 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// 
SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "hillclimb_allocator.hpp" + +#include "common/numeric_util.hpp" + +#include +#include +#include +#include + +namespace regor +{ + +constexpr Address MAX_ADDRESS = std::numeric_limits
::max(); +constexpr Address NOT_ALLOCATED = -1; + +void HillClimbAllocator::SetLiveRanges(const std::vector> &liveRanges, int alignment) +{ + int maxEndTime = 0; + int id = 0; + for ( const auto &lr : liveRanges ) + { + HillClimbLiveRange hlr = {}; + + hlr.startTime = lr->startTime; + hlr.endTime = lr->endTime; + hlr.size = RoundAway(lr->size, alignment); + hlr.id = id; + maxEndTime = std::max(maxEndTime, lr->endTime); + lrs.push_back(hlr); + ++id; + } + lrsAtTime.resize(maxEndTime + 1); + sizeAtTime.resize(maxEndTime + 1); + neighbours.resize(lrs.size()); + // Calculate which live ranges are active at every timestamp + for ( int t = 0; t <= maxEndTime; ++t ) + { + lrsAtTime[t].clear(); + } + for ( auto &lr : lrs ) + { + for ( auto t = lr.startTime; t <= lr.endTime; ++t ) + { + lrsAtTime[t].push_back(&lr); + } + } + minRequiredSize = 0; + for ( int t = 0; t <= maxEndTime; ++t ) + { + // Calculate minimum needed size at each timestamp + Address neededSize = 0; + for ( auto &lr : lrsAtTime[t] ) + { + neededSize += lr->size; + } + sizeAtTime[t] = neededSize; + minRequiredSize = std::max(neededSize, minRequiredSize); + // Calculate all neighbours + for ( unsigned i = 0; i < lrsAtTime[t].size(); ++i ) + { + auto lr1 = lrsAtTime[t][i]; + auto &nb1 = neighbours[lr1->id]; + for ( auto j = i + 1; j < lrsAtTime[t].size(); ++j ) + { + auto lr2 = lrsAtTime[t][j]; + if ( find(nb1.begin(), nb1.end(), lr2) == nb1.end() ) + { + nb1.push_back(lr2); + neighbours[lr2->id].push_back(lr1); + } + } + } + } + targetSize = minRequiredSize; + // Calculate the urgency of each live range + lrUrgency.resize(lrs.size()); + for ( unsigned i = 0; i < lrs.size(); ++i ) + { + auto &lr = lrs[i]; + Address urgency = 0; + for ( auto t = lr.startTime; t <= lr.endTime; ++t ) + { + urgency = std::max(sizeAtTime[t], urgency); + } + lrUrgency[i] = urgency; + } +} + +Address HillClimbAllocator::Allocate(const std::vector> &liveRanges, int alignment, Address sizeLimit) +{ + SetLiveRanges(liveRanges, 
alignment); + maxAllowedSize = sizeLimit; + iterations = 0; + std::vector indices; + int sz = int(liveRanges.size()); + // Initial solution, using a heuristic allocator + for ( int i = 0; i < sz; ++i ) + { + indices.push_back(i); + } + SortIndicesOnPrio(indices); + // Allocate the initial solution + bestSize = MAX_ADDRESS; + bestSize = AllocateIndices(indices); + if ( bestSize <= targetSize ) + { + // The heuristic allocation returned an optimal solution. + // No need to search. + } + else + { + // Try to improve the heuristic allocation + Search(indices, MAX_ITERATIONS); + } + // Allocate addresses + for ( int i = 0; i < sz; ++i ) + { + liveRanges[i]->SetAddress(lrs[i].address); + } + return bestSize; +} + +void HillClimbAllocator::AllocateLr(HillClimbLiveRange &lr) const +{ + Address address = 0; + int predecessor = NO_PREDECESSOR; + bool fits = false; + while ( !fits ) + { + fits = true; + // Find neighbours that overlap with address + for ( auto lr2_p : neighbours[lr.id] ) + { + if ( lr2_p->address == NOT_ALLOCATED || lr2_p->endAddress <= address ) + { + continue; + } + if ( lr2_p->Overlaps(address, lr.size) ) + { + // Overlap found; increase address + fits = false; + address = lr2_p->endAddress; + predecessor = lr2_p->id; + } + } + } + lr.address = address; + lr.endAddress = address + lr.size; + lr.predecessor = predecessor; +} + +Address HillClimbAllocator::AllocateIndices(const std::vector &indices) +{ + ++iterations; + int sz = int(indices.size()); + std::vector count(sz); + for ( auto &lr : lrs ) + { + lr.address = NOT_ALLOCATED; + } + Address size = 0; + for ( int turn = 0; size <= bestSize && turn < sz; ++turn ) + { + auto &lr = lrs[indices[turn]]; + AllocateLr(lr); + lr.turn = turn; + size = std::max(size, lr.endAddress); + } + return size; +} + +void HillClimbAllocator::SortIndicesOnPrio(std::vector &indices) const +{ + std::sort(indices.begin(), indices.end(), + [this](int const &a, int const &b) + { + // urgent first + if ( lrUrgency[a] != 
lrUrgency[b] ) + { + return lrUrgency[a] > lrUrgency[b]; + } + auto &lr1 = lrs[a]; + auto &lr2 = lrs[b]; + // long duration before short duration + auto duration1 = lr1.endTime - lr1.startTime; + auto duration2 = lr2.endTime - lr2.startTime; + if ( duration1 != duration2 ) + { + return duration1 > duration2; + } + if ( lr1.startTime != lr2.startTime ) + { + return lr1.startTime < lr2.startTime; + } + if ( lr1.size != lr2.size ) + { + return lr1.size > lr2.size; + } + return lr1.id < lr2.id; + }); +} + +void HillClimbAllocator::AddPredecessorTurns(std::set &turns, const HillClimbLiveRange &lr) const +{ + turns.insert(lr.turn); + int id = lr.id; + while ( lrs[id].predecessor != NO_PREDECESSOR ) + { + id = lrs[id].predecessor; + turns.insert(lrs[id].turn); + } +} + +void HillClimbAllocator::AttemptBottleneckFix(std::vector &indices, int iterationsStuck) +{ + // Find the bottleneck + HillClimbLiveRange *maxLr = &lrs[0]; + for ( auto &lr : lrs ) + { + if ( lr.endAddress > maxLr->endAddress ) + { + maxLr = &lr; + } + } + // Find all live ranges that affected the placement of the bottleneck live range. + // This consists of two types of live ranges: + // - direct neighbours of the bottleneck live range + // - direct and indirect predecessors of these neighbours + bottleneck + // The turns at which these live ranges were allocated are put in the turns vector. + std::set turns; + AddPredecessorTurns(turns, *maxLr); + for ( auto lr_p : neighbours[maxLr->id] ) + { + AddPredecessorTurns(turns, *lr_p); + } + // Non-direct neighbours that interfere with the allocation of the bottleneck are the + // immediate cause for gaps in the allocation, and are selected with higher probability. 
+ std::vector turnList; + std::vector nonNbTurnList; + for ( auto turn : turns ) + { + turnList.push_back(turn); + auto &lr = lrs[indices[turn]]; + if ( !maxLr->IsNeighbour(lr) ) + { + nonNbTurnList.push_back(turn); + } + } + // Pick from non-neighbour list with 30% probability (magic number based on tuning) + int ix1; + if ( rng() % 100 < 30 && !nonNbTurnList.empty() ) + { + // Pick a live range from the "non-neighbour list" + ix1 = nonNbTurnList[rng() % nonNbTurnList.size()]; + } + else + { + // Pick any affecting live range. + ix1 = turnList[rng() % turnList.size()]; + } + // Note: turnList has always at least 2 elements for bottlenecks + int ix2 = turnList[rng() % (turnList.size() - 1)]; + if ( ix1 == ix2 ) + { + ix2 = turnList[turnList.size() - 1]; + } + // Swap indices + std::swap(indices[ix1], indices[ix2]); + if ( iterationsStuck > MAX_ITERATIONS_STUCK ) + { + // The best allocation has not improved for a while, maybe improvement is not possible + // by single-swapping indices; add more neighbour live ranges and swap 2 more indices. + // Adding more neighbours can sometimes resolve the situation where the current bottleneck + // is resolved, but always results in a higher bottleneck at a nearby live range. 
+ // Magic number is based on tuning + for ( auto turn : nonNbTurnList ) + { + for ( auto lr_p : neighbours[indices[turn]] ) + { + if ( turns.count(lr_p->turn) == 0 ) + { + turns.insert(lr_p->turn); + turnList.push_back(lr_p->turn); + } + } + } + ix1 = turnList[rng() % turnList.size()]; + ix2 = turnList[rng() % turnList.size()]; + std::swap(indices[ix1], indices[ix2]); + } +} + +void HillClimbAllocator::Search(std::vector &indices, int iters) +{ + std::vector bestIndices = indices; + std::vector bestLrs = lrs; + int lastImprovementIteration = 0; + + for ( int i = 0; bestSize > maxAllowedSize && i < iters && i - lastImprovementIteration < MIN_ITERATIONS_IMPROVE; ++i ) + { + // Reorder the indices + AttemptBottleneckFix(indices, i - lastImprovementIteration); + // Allocate the reordered indices and check if it gave an improvement + auto newSize = AllocateIndices(indices); + if ( newSize <= bestSize ) + { + // The new allocation produced a new best result; remember it + if ( newSize < bestSize ) + { + lastImprovementIteration = i; + } + bestSize = newSize; + bestIndices = indices; + bestLrs = lrs; + if ( bestSize <= targetSize ) + { + // Target reached; stop + return; + } + } + else + { + // The new allocation produced worse result; undo the change + indices = bestIndices; + lrs = bestLrs; + } + } + lrs = std::move(bestLrs); +} + +Address HillClimbAllocateLiveRanges(LiveRangeGraph &lrGraph, int alignment, Address sizeLimit) +{ + HillClimbAllocator allocator; + return allocator.Allocate(lrGraph.LiveRanges(), alignment, sizeLimit); +} + +} // namespace regor diff --git a/ethosu/regor/compiler/hillclimb_allocator.hpp b/ethosu/regor/compiler/hillclimb_allocator.hpp new file mode 100644 index 00000000..bfbfb343 --- /dev/null +++ b/ethosu/regor/compiler/hillclimb_allocator.hpp @@ -0,0 +1,172 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 
2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "architecture/architecture.hpp" +#include "live_range.hpp" + +#include +#include +#include +#include +#include + +namespace regor +{ + +struct HillClimbLiveRange +{ + // Start time, input to the allocator + int startTime; + // End time (inclusive), input to the allocator + int endTime; + // Size, input to the allocator + int size; + // Allocated address, output of the allocator + Address address; + // End address, exclusive + Address endAddress; + // Index of this live range + int id; + // id of predecessor live range (predecessor's end address == this lr's address) + int predecessor; + // Turn at which the live range was allocated + int turn; + + bool Overlaps(Address addr2, Address size2) const { return address < addr2 + size2 && addr2 < endAddress; } + bool IsNeighbour(const HillClimbLiveRange &lr) const { return startTime <= lr.endTime && lr.startTime <= endTime; } +}; + +// Implementation of a tensor allocator using state space exploration. 
+// +// The basic algorithm is: +// +// - Use a heuristic allocator to find an initial allocation +// - while allocation is not optimal and iterations < MAX_ITERATIONS: +// - find the "bottleneck": the live range with highest end address +// - find all live ranges that affected the allocation of the bottleneck +// - swap the order of any two affecting live ranges +// - reallocate tensors using the reordered live ranges +// - if the new allocation is better: keep it, else set allocation to previous allocation +class HillClimbAllocator +{ +private: + static constexpr int MAX_ITERATIONS = 99999; + // Special handling if best solution has not improved during this many iterations + static constexpr int MAX_ITERATIONS_STUCK = 50; + // Minimum number of iterations since the last improvement (unless an optimal solution is found) + static constexpr int MIN_ITERATIONS_IMPROVE = 5000; + // Used for live ranges allocated at address 0 + static constexpr int NO_PREDECESSOR = -1; + // Contains the live ranges + std::vector lrs; + // Contains active live ranges at each timestamp + std::vector> lrsAtTime; + // + // Contains neighbours of each live range (indexed by lr.id), i.e. + // live ranges with overlapping start/end time. + std::vector> neighbours; + // + // At each timestamp: accumulated size of active live ranges + std::vector
sizeAtTime; + // + // For each live range: max value of sizeAtTime (only used in the heuristic allocation) + std::vector
lrUrgency; + // + // The maximum allowed size (the size of the physical available memory) + Address maxAllowedSize = 0; + // The minimum possible size, assuming all live ranges can be perfectly allocated + Address minRequiredSize = 0; + // The algorithm stops once the target size has been achieved + Address targetSize = 0; + // The highest end address of the best found allocation + Address bestSize = 0; + // Number of performed iterations + int iterations = 0; + // Random number generator; use default seed (which is well-defined) + std::mt19937 rng; + +public: + // Runs the allocation algorithm and updates the address field of lrs. + // Finishes when the target size has been reached or when maximum iterations have been run. + // + // Implementation note: the algorithm produces reproducible results by using + // a well-defined random number generator with well-defined default seed, + // and using a fixed number of iterations. + Address Allocate(const std::vector> &lrs, int alignment, Address sizeLimit); + + Address MinimumRequiredSize() const { return minRequiredSize; } + int Iterations() const { return iterations; } + +private: + void SetLiveRanges(const std::vector> &liveRanges, int alignment); + + // Allocates the given live range at the smallest possible address + void AllocateLr(HillClimbLiveRange &lr) const; + // + // Allocates the live ranges in the order indicated by the indices; + // allocates each live range at the lowest possible address. + + Address AllocateIndices(const std::vector &indices); + + // Sorts live ranges based on heuristics, used for the initial allocation + void SortIndicesOnPrio(std::vector &indices) const; + + // Adds the given live range + predecessors to the turns vector + void AddPredecessorTurns(std::set &turns, const HillClimbLiveRange &lr) const; + + // Finds the "bottleneck", the live range with highest end address, and reorders the indices + // such that a next allocation might lower the memory usage. 
+ // + // --------- + // | | + // | D | + // | | + // ---------------------------------- + // | B | + // ------------------------------- + // | | + // |A| --- + // | | |C| + // | | | | + // --------------------------------------- + // + // In the above example, the allocation order was [A, B, C, D] and D is the resulting bottle-neck. + // The live ranges that affected the allocation of D are the direct neighbours of D (i.e. B and C), + // and all direct and indirect predecessors of D and its neighbours + // (i.e. A, which is the predecessor of B, and indirect predecessor of D). + // + // By permuting the order in which the affecting live ranges are allocated, the bottleneck might + // be lowered. In the above example, almost any permutation would lower the bottleneck. + // + // Note that there is room to improve the efficiency of the algorithm. + // One way could be to first allocate all direct neighbours of the bottleneck + // (i.e. B, C, D) and then the other affecting live ranges (i.e. A). The algorithm currently does + // not actively try this, as it may lead to allocation loops (A could become the new bottle-neck); + // it just uses a higher probability of selecting A. + void AttemptBottleneckFix(std::vector &indices, int iterationsStuck); + + // Search for a solution, using the given indices as initial solution. 
+ void Search(std::vector &indices, int iterations); +}; + +// Wrapper function to perform live range allocation +Address HillClimbAllocateLiveRanges(LiveRangeGraph &lrGraph, int alignment, Address sizeLimit); + +} // namespace regor diff --git a/ethosu/regor/compiler/kernel.hpp b/ethosu/regor/compiler/kernel.hpp new file mode 100644 index 00000000..7bea642e --- /dev/null +++ b/ethosu/regor/compiler/kernel.hpp @@ -0,0 +1,161 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "common/common.hpp" + +#include "common/numeric_util.hpp" +#include "include/graphapi.hpp" + +#include +#include + +namespace regor +{ + +class Margin +{ +private: + int _top = 0; + int _left = 0; + int _bottom = 0; + int _right = 0; + int _near = 0; + int _far = 0; + +public: + Margin(int top, int left, int bottom, int right) : _top(top), _left(left), _bottom(bottom), _right(right) {} + Margin(int top, int left, int bottom, int right, int near, int far) : + _top(top), _left(left), _bottom(bottom), _right(right), _near(near), _far(far) + { + } + + Margin() = default; + + int Top() const { return _top; } + int Left() const { return _left; } + int Bottom() const { return _bottom; } + int Right() const { return _right; } + int Near() const { return _near; } + int Far() const { return _far; } + + bool IsZero() const { return !(_top | _left | _bottom | _right | _near | _far); } + + std::string ToString() const + { + return fmt::format("[t:{},l:{},b:{},r:{},n:{},f:{}]", _top, _left, _bottom, _right, _near, _far); + } +}; + +/// +/// Kernel parameters +/// +class Kernel +{ +private: + Point2i _size; + Point2i _stride; + Point2i _dilation; + int _sizeZ = 0; + int _strideZ = 0; + int _dilationZ = 0; + Margin _padding; + int _depthMultiplier = 0; + + +public: + Kernel(const GraphApi::GraphKernel *kernel) + { + _size = Point2i(kernel->sizeYXZ[1], kernel->sizeYXZ[0]); + _sizeZ = kernel->sizeYXZ[2]; + _stride = Point2i(kernel->strideYXZ[1], kernel->strideYXZ[0]); + _strideZ = kernel->strideYXZ[2]; + _dilation = Point2i(kernel->dilationYXZ[1], kernel->dilationYXZ[0]); + _dilationZ = kernel->dilationYXZ[2]; + _padding = Margin(kernel->paddingTBLRNF[0], kernel->paddingTBLRNF[2], kernel->paddingTBLRNF[1], + kernel->paddingTBLRNF[3], kernel->paddingTBLRNF[4], kernel->paddingTBLRNF[5]); + _depthMultiplier = 0; + assert(_size.x > 0 && _size.y > 0); + } + + Kernel(Point2i size, Point2i stride, Point2i dilation, int depthMultiplier = 1, Margin padding = 
Margin(0, 0, 0, 0)) + { + assert(size.x > 0 && size.y > 0); + assert(stride.x > 0 && stride.y > 0); + _size = size; + _stride = stride; + _dilation = dilation; + _depthMultiplier = depthMultiplier; + _padding = padding; + } + Kernel() = default; + + int ElementsWH() const { return _size.x * _size.y; } + const Point3 Size3D() const { return {_size.x, _size.y, _sizeZ}; } + const Point3 Stride3D() const { return {_stride.x, _stride.y, _strideZ}; } + const Point3 Dilation3D() const { return {_dilation.x, _dilation.y, _dilationZ}; } + const Point2i &Size() const { return _size; } + const Point2i &Stride() const { return _stride; } + const Point2i &Dilation() const { return _dilation; } + int DepthMultiplier() const { return _depthMultiplier; } + const Margin &Padding() const { return _padding; } + + Kernel WithSize(Point2i size) const + { + Kernel tmp(*this); + tmp._size = size; + return tmp; + } + + Kernel WithStride(Point2i stride) const + { + Kernel tmp(*this); + tmp._stride = stride; + return tmp; + } + + Kernel WithDilation(Point2i dilation) const + { + Kernel tmp(*this); + tmp._dilation = dilation; + return tmp; + } + + Kernel WithPadding(Margin padding) const + { + Kernel tmp(*this); + tmp._padding = padding; + return tmp; + } + + Point2i DilatedWH() const { return (_dilation * (_size - Point2i(1, 1))) + Point2i(1, 1); } + + std::string ToString() const + { + return fmt::format("size={},{} stride={},{}, dilation={},{} padding={}", _size.x, _size.y, _stride.x, _stride.y, + _dilation.x, _dilation.y, _padding.ToString()); + } +}; + +static inline int RequiredInputSize(int value, int stride, int border, int upscale, int rounding = 0) +{ + return int(std::ceil(float((value - 1) * stride + border + rounding) / float(upscale))); +} + +} // namespace regor diff --git a/ethosu/regor/compiler/live_range.cpp b/ethosu/regor/compiler/live_range.cpp new file mode 100644 index 00000000..9671045c --- /dev/null +++ b/ethosu/regor/compiler/live_range.cpp @@ -0,0 +1,233 @@ +// +// 
SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + + +#include "live_range.hpp" + +#include "architecture/architecture.hpp" +#include "scheduler.hpp" +#include "scheduler_operation.hpp" +#include "tensor.hpp" + +#include +#include +#include +#include +#include +#include + +namespace regor +{ + +std::vector LiveRangeGraph::GetTemporalMemoryUsage(int &maxUsage) +{ + std::vector usage(_currentTime + 1); + for ( const auto &lr : _lrs ) + { + assert(lr->endTime <= _currentTime); + for ( int i = lr->startTime; i <= lr->endTime; ++i ) + { + usage[i] += lr->size; + } + } + maxUsage = *std::max_element(usage.begin(), usage.end()); + return usage; +} + +void LiveRangeGraph::ExtractLiveRangesFromCascades(const std::vector> &schedOps, + Schedule *schedule, const MemArea &targetMemory, bool addRollingBuffers) +{ + std::unordered_map timeForCascade; + auto startTime = _currentTime; + // Live ranges containing graph output + std::vector graphOutputRanges; + for ( const auto &schedOp : schedOps ) + { + SchedulerOpInfo *opInfo = schedule->Cost(schedOp.get()); + int cascade = opInfo->cascade; + + CascadeInfo *cascadeInfo = cascade == 0 ? 
nullptr : &schedule->cascades[cascade]; + CascadeBuffer *cascadeBuffer = nullptr; + + if ( cascadeInfo == nullptr ) + { + // Check if op have an ifm tensor that can be reused for the ofm + auto ifmTens = ReusableIFM(schedOp, targetMemory); + if ( ifmTens != nullptr ) + { + // ifm can be reused + FuseRanges(ifmTens, schedOp->OFM()->tensor.get()); + } + } + + int timeToSet = _currentTime; + if ( cascadeInfo != nullptr ) + { + auto entry = cascadeInfo->buffers.find(*schedOp); + if ( entry != cascadeInfo->buffers.end() ) + { + cascadeBuffer = &entry->second; + } + auto tfcEntry = timeForCascade.find(cascade); + if ( tfcEntry != timeForCascade.end() ) + { + timeToSet = tfcEntry->second; + } + } + opInfo->timeIndex = timeToSet; + + // Mark usage for all relevant tensors related to this operation + for ( auto &liveTensor : schedOp->LiveRangeTensors() ) + { + auto usage = liveTensor.first; + auto tens = liveTensor.second; + bool isRollingBuffer = cascadeBuffer != nullptr && usage == MakeTensorUsage(TensorUsage::IFM, schedOp->PrimaryIfmIndex()); + if ( ShouldBeIgnored(tens, targetMemory) && !(addRollingBuffers && isRollingBuffer) ) + { + continue; + } + auto lr = GetOrCreateRange(tens); + if ( tens->isGraphInput ) + { + // Graph input must not be overwritten by preceding schedOps + lr->MarkUsage(startTime); + } + if ( tens->isGraphOutput ) + { + // Graph output must not be overwritten by following schedOps + graphOutputRanges.push_back(lr); + } + lr->MarkUsage(timeToSet); + if ( isRollingBuffer ) + { + // This tensor is a rolling buffer in a cascade and the size of the LiveRange needs to be modified + // for enabling temporal memory snapshots without modifying the original Tensor + lr->size = cascadeBuffer->sizeBytes; + } + } + // Buffered weight tensor + auto weightTens = opInfo->bufferedWeightTensor.tensor.get(); + if ( !ShouldBeIgnored(weightTens, targetMemory) ) + { + auto lr = GetOrCreateRange(weightTens); + if ( opInfo->bufferedWeightTensor.preBuffer ) + { + 
lr->MarkUsage(timeToSet - 1, 2); + } + else + { + lr->MarkUsage(timeToSet); + } + } + // Read-only weight/scale tensors + for ( auto tens : {opInfo->npuWeightsTensor, opInfo->npuScalesTensor} ) + { + if ( !ShouldBeIgnored(tens.get(), targetMemory) ) + { + auto lr = GetOrCreateRange(tens.get()); + lr->MarkUsage(timeToSet); + } + } + if ( timeToSet == _currentTime ) + { + _currentTime += 2; + } + if ( cascade != 0 ) + { + timeForCascade[cascade] = timeToSet; + } + } + for ( auto lr : graphOutputRanges ) + { + lr->MarkUsage(_currentTime, 1); + ++_currentTime; + } +} + +LiveRange *LiveRangeGraph::GetOrCreateRange(SchedulerTensor *tens) +{ + // Return the live range of the tensor (or any of its clones) + const auto entry = _equivalenceIdToLr.find(tens->equivalenceId); + if ( entry != _equivalenceIdToLr.end() ) + { + entry->second->AddTensor(tens); + return entry->second; + } + // No live range found for the tensor, create a new one + auto lr = std::make_shared(tens); + _lrs.push_back(lr); + _equivalenceIdToLr[tens->equivalenceId] = lr.get(); + return lr.get(); +} + +LiveRange *LiveRangeGraph::FuseRanges(SchedulerTensor *inTens, SchedulerTensor *outTens) +{ + assert(outTens->AllocationSizeBytes() <= inTens->AllocationSizeBytes()); + auto lr = GetOrCreateRange(inTens); + lr->AddTensor(outTens); + const auto entry = _equivalenceIdToLr.find(outTens->equivalenceId); + if ( entry != _equivalenceIdToLr.end() ) + { + // Live range already existed for outTens, move over tensors + auto &lr2 = entry->second; + lr->tensors.insert(lr2->tensors.begin(), lr2->tensors.end()); + lr2->tensors.clear(); + lr2->size = 0; + } + _equivalenceIdToLr[outTens->equivalenceId] = lr; + return lr; +} + +SchedulerTensor *LiveRangeGraph::ReusableIFM(const std::unique_ptr &schedOp, const MemArea &targetMemory) +{ + SchedulerTensor *reusableIfm = nullptr; + if ( IsElementwise(schedOp->Type()) ) + { + // Check if possible to merge ifm/ofm live ranges of elementwise op + const auto ofmConn = 
schedOp->OFM(); + const auto ofmTens = ofmConn->tensor.get(); + + if ( !ShouldBeIgnored(ofmTens, targetMemory) ) + { + for ( const auto &[usage, ifmConn] : schedOp->inputs.pairs() ) + { + const auto ifmTens = ifmConn.tensor.get(); + + if ( IsIFM(usage) && !ifmTens->isGraphOutput && ifmConn.shape == ofmConn->shape && + ifmTens->format == ofmTens->format && ifmTens->dataType == ofmTens->dataType && + !ShouldBeIgnored(ifmTens, targetMemory) && ifmTens->consumers.size() == 1 && ofmTens->producers.size() == 1 ) + { + reusableIfm = ifmTens; + break; + } + } + } + } + return reusableIfm; +} + +bool LiveRangeGraph::ShouldBeIgnored(SchedulerTensor *tens, const MemArea &targetMemory) +{ + if ( tens == nullptr ) + { + return true; + } + return tens->memArea != targetMemory; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/live_range.hpp b/ethosu/regor/compiler/live_range.hpp new file mode 100644 index 00000000..2a22a6fc --- /dev/null +++ b/ethosu/regor/compiler/live_range.hpp @@ -0,0 +1,112 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "architecture/architecture.hpp" +#include "scheduler.hpp" +#include "scheduler_operation.hpp" +#include "tensor.hpp" + +#include +#include +#include +#include +#include +#include + +namespace regor +{ + +/// +/// Live range +/// +struct LiveRange +{ + // Tensors with equivalence ids that are assigned to the same LiveRange will be allocated to the same address + std::unordered_set tensors; + // Time at which the live range's tensors start being used. + int startTime = std::numeric_limits::max(); + // Note: the end time is inclusive + int endTime = -1; + int size = 0; + MemArea memArea; + std::string name = ""; + + LiveRange(SchedulerTensor *tensor) + { + size = tensor->AllocationSizeBytes(); + memArea = tensor->memArea; + name = tensor->Name(); + AddTensor(tensor); + } + + void AddTensor(SchedulerTensor *tensor) { tensors.insert(tensor); } + + void MarkUsage(int opTime, int opDuration = 1) + { + assert(opDuration >= 0); + int opTimeStart = std::max(opTime, 0); + int opTimeEnd = opTime + opDuration; + if ( opTimeEnd > opTimeStart ) + { + startTime = std::min(startTime, opTimeStart); + endTime = std::max(endTime, opTimeEnd); + } + } + + void SetAddress(Address address) + { + for ( auto &tensor : tensors ) + { + tensor->allocatedAddress = address; + } + } + + std::string ToString() const + { + return fmt::format("", name, startTime, endTime, size); + } +}; + +class LiveRangeGraph +{ +private: + /** All allocated live ranges */ + std::vector> _lrs; + /** Map from equivalence id -> live range */ + std::unordered_map _equivalenceIdToLr; + int _currentTime = 0; + +public: + virtual ~LiveRangeGraph() = default; + int EndTime() const { return _currentTime + 1; } + + std::vector> LiveRanges() const { return _lrs; }; + + /** usage[t] will be set to the memory usage at time t, for each timestamp t in the live graph */ + std::vector GetTemporalMemoryUsage(int &maxUsage); + void ExtractLiveRangesFromCascades(const std::vector> &schedOps, + Schedule 
*schedule, const MemArea &targetMemory, bool addRollingBuffers); + LiveRange *GetOrCreateRange(SchedulerTensor *tens); + LiveRange *FuseRanges(SchedulerTensor *inTens, SchedulerTensor *outTens); + SchedulerTensor *ReusableIFM(const std::unique_ptr &schedOp, const MemArea &targetMemory); + virtual bool ShouldBeIgnored(SchedulerTensor *tens, const MemArea &targetMemory); +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/network_performance.cpp b/ethosu/regor/compiler/network_performance.cpp new file mode 100644 index 00000000..c47868c5 --- /dev/null +++ b/ethosu/regor/compiler/network_performance.cpp @@ -0,0 +1,317 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "network_performance.hpp" + +#include "common/common.hpp" + +#include "database.hpp" +#include "graph_optimiser.hpp" + +#include + +BEGIN_ENUM_TABLE(regor::AccessType) + ADD_ENUM_NAME(Lut) + ADD_ENUM_NAME(FeatureMap) + ADD_ENUM_NAME(Weights) + ADD_ENUM_NAME(Scales) +END_ENUM_TABLE() + +namespace regor +{ +NetworkPerformance::NetworkPerformance(Architecture *arch, const std::vector> &ops) : + _arch(arch), _ops(ops) +{ + assert(arch); +} + +PerformanceResult NetworkPerformance::Measure(Schedule *schedule, OptimiserDatabase *optDb) +{ + SchedulerOperation *prevOp = nullptr; + SchedulerOpInfo *prevCost = nullptr; + PerformanceResult performance; + Database *db = nullptr; + std::unordered_set memories({_arch->ReadonlyMemory().memory, _arch->FeatureMapMemory().memory, + _arch->LUTMemory().memory, _arch->StagingMemory().memory}); + std::unordered_set regions( + {_arch->ReadonlyMemory(), _arch->FeatureMapMemory(), _arch->LUTMemory(), _arch->StagingMemory()}); + int opTable = 0; + int opTableColumnCount = 0; + std::unordered_set tensorUids; + + if ( optDb ) + { + db = optDb->Get(); + opTable = db->AddTable("perf"); + std::vector columns = { + "source_id", + "optimised_id", + "operator", + "name", + "staging_usage", + "op_cycles", + "npu_cycles", + "mac_count", + }; + for ( const auto &mem : memories ) + { + std::string label = mem->Name() + "_ac"; + columns.push_back(label); + } + db->AddColumns(opTable, columns); + opTableColumnCount = int(columns.size()); + } + + for ( auto const &schedOp : _ops ) + { + SchedulerOpInfo *cost = schedule->Cost(schedOp.get()); + PerformanceResult perf = {}; + if ( schedOp->IsNpuOp() ) + { + perf = EstimateFullOpPerformance(schedOp.get(), cost, prevOp, prevCost); + perf.npuOps = 1; + perf.memory[_arch->StagingMemory().memory].peakUsage = schedule->MemoryUsageAt(cost->timeIndex); + + // Calculate total original and encoded weights + // Weight statistics is not set on a per-operation level as some operations share weight tensors 
+ SchedulerConnection *weightConn = schedOp->TryInput(TensorUsage::Weights); + if ( weightConn && cost->npuWeightsTensor ) + { + // check if the weight tensor has already been accounted for in total weights + auto pos = tensorUids.find(weightConn->tensor->uid); + if ( pos == std::end(tensorUids) ) + { + tensorUids.insert(weightConn->tensor->uid); + performance.originalWeights += weightConn->tensor->AllocationSizeBytes(); + performance.encodedWeights += cost->npuWeightsTensor->totalWeightBytes; + } + } + } + else + { + perf.cpuCycles = 1; // TODO: model CPU cycle counts + perf.cpuOps = 1; + } + // Insert any missing memories + for ( ArchitectureMemory *a : memories ) + { + perf.memory.emplace(a, PerformanceResult::MemoryAccesses{}); + } + + if ( optDb != nullptr ) + { + AddToDatabase(perf, schedOp, opTable, opTableColumnCount, memories, optDb); + } + + performance += perf; + prevOp = schedOp.get(); + prevCost = cost; + } + // TODO: Remove this line and separate memory allocation from usage. + performance.memory[_arch->StagingMemory().memory].peakUsage = 0; + + for ( auto ®ion : regions ) + { + // RHS is not peak usage, but peak allocation. 
+ performance.memory[region.memory].peakUsage += schedule->memoryUsage[region]; + } + + performance.cascades = schedule->cascades.size(); + + return performance; +} + +void NetworkPerformance::AddToDatabase(const PerformanceResult &perf, const std::unique_ptr &schedOp, + int opTable, int /*opTableColumnCount*/, const std::unordered_set &memories, OptimiserDatabase *optDb) +{ + // Per-layer calculations + assert(optDb != nullptr); + std::vector row; + std::string opName = "N/A"; + Database *db = optDb->Get(); + + const auto *conn = schedOp->TryOFM(); + if ( conn != nullptr && conn->tensor != nullptr && conn->tensor->srcTensor != nullptr ) + { + opName = conn->tensor->srcTensor->Name(); + } + + int sourceId = optDb->SourceId(schedOp->_srcKey); + int optId = optDb->OptimisedId(schedOp->_srcKey); + row = { + std::to_string(sourceId), + std::to_string(optId), + OpTypeToString(schedOp->Type()), + std::move(opName), + std::to_string(perf.memory.at(_arch->StagingMemory().memory).peakUsage), + std::to_string(perf.totalCycles), + std::to_string(perf.npuCycles), + std::to_string(perf.macCount), + }; + + for ( const auto mem : memories ) + { + row.push_back(std::to_string(perf.memory.at(mem).AccessCycles())); + } + + db->AddRow(opTable, schedOp->Index(), std::move(row)); +} + + +PerformanceResult NetworkPerformance::EstimateFullOpPerformance( + SchedulerOperation *schedOp, SchedulerOpInfo *cost, SchedulerOperation *prevOp, SchedulerOpInfo *prevCost) +{ + UNUSED(prevOp); + PerformanceQuery query = Scheduler::InitPerfQuery(schedOp, cost->Config(), -1); + std::vector fused = Scheduler::InitFusionQuery(schedOp); + + CycleCost cycles = _arch->Performance()->MeasureCycleCost(query, fused); + + PerformanceResult result; + result.npuCycles = cycles.opCycles; + result.macCount = cycles.macs; + + if ( cost->cascade != 0 ) + { + result.cascadedOps = 1; + } + + ElementAccess access = _arch->Performance()->MeasureElementAccess(query); + ElementAccess byteAccess = 
_arch->Performance()->ElementTransferToBytes(query, access); + + // How many NPU cycles are available under the previously executing + // operator for performing buffered DMA transfers + int64_t slackCycles = (prevCost != nullptr) ? prevCost->slackBufferingCycles : 0; + + // LUT transfer stats + auto lut = schedOp->TryInput(TensorUsage::LUT); + int64_t lutTransferCycles = 0; + + if ( lut ) + { + auto srcMemory = lut->tensor->memArea.memory; + auto dstMemory = _arch->LUTMemory().memory; + assert(srcMemory); + + if ( (srcMemory != nullptr) && (dstMemory != srcMemory) ) + { + int copySize = lut->PartialAllocationSizeBytes(); + lutTransferCycles = _arch->Performance()->MemToMemCycles(dstMemory, srcMemory, copySize); + + result.memory[srcMemory].access[AccessType::Lut].bytesRead += copySize; + result.memory[dstMemory].access[AccessType::Lut].bytesWritten += copySize; + } + } + + // Memory that NPU will source weights from for operations + ArchitectureMemory *weightsMemory = cost->npuWeightsTensor ? 
cost->npuWeightsTensor->memArea.memory : nullptr; + + if ( weightsMemory && cost->bufferedWeightTensor.tensor ) + { + // DMA Weight Transfer + int initialSize = 0; + + // Get the size of the first DMA + for ( int streamIndex = 0; streamIndex < cost->npuWeightsTensor->subStreams; streamIndex++ ) + { + auto pos = cost->npuWeightsTensor->encodedRanges.find(streamIndex); + if ( pos != cost->npuWeightsTensor->encodedRanges.end() ) + { + initialSize += pos->second.TotalBytes(); + } + } + + auto srcWeightMem = weightsMemory; + auto dstWeightMem = cost->bufferedWeightTensor.tensor->memArea.memory; + assert(srcWeightMem != dstWeightMem); + + weightsMemory = dstWeightMem; // Update source to use buffered weight memory + + // Calculate initial weight transfer cycles + int64_t weightCycles = _arch->Performance()->MemToMemCycles(dstWeightMem, srcWeightMem, initialSize); + weightCycles = std::max(weightCycles - slackCycles, int64_t(0)); + + int weightsSize = cost->npuWeightsTensor->AllocationSizeBytes(); + result.memory[srcWeightMem].access[AccessType::Weights].bytesRead += weightsSize; + result.memory[dstWeightMem].access[AccessType::Weights].bytesWritten += weightsSize; + + // Add cycles for Weight + Scale Transfer + result.npuCycles = std::max(cost->fullWeightTransferCycles - slackCycles + cost->slackBufferingCycles, cycles.opCycles + weightCycles); + } + else + { + // Calculate non-hidden LUT transfer cycles + lutTransferCycles = std::max(lutTransferCycles - slackCycles, int64_t(0)); + } + + // Add cycles for LUT Transfer + result.npuCycles += lutTransferCycles; + + // OFM write + auto ofm = schedOp->OFM(); + result.memory[ofm->tensor->memArea.memory].access[AccessType::FeatureMap].bytesWritten += byteAccess.ofmWrite; + + // IFM1 read + auto ifm = schedOp->IFM(0); + result.memory[ifm->tensor->memArea.memory].access[AccessType::FeatureMap].bytesRead += byteAccess.ifmRead[0]; + + // IFM2 read + auto ifm2 = schedOp->TryIFM(1); + if ( ifm2 ) + { + 
result.memory[ifm2->tensor->memArea.memory].access[AccessType::FeatureMap].bytesRead += byteAccess.ifmRead[1]; + } + + // Weight read + if ( cost->npuWeightsTensor && access.constRead[0] > 0 ) + { + int encodedWeightsSize = cost->npuWeightsTensor->totalWeightBytes; + result.memory[weightsMemory].access[AccessType::Weights].bytesRead += int64_t(encodedWeightsSize) * access.weightsRefetch; + } + + // Scale read + if ( cost->npuWeightsTensor && access.constRead[1] > 0 ) + { + int encodedScaleSize = cost->npuWeightsTensor->AllocationSizeBytes() - cost->npuWeightsTensor->totalWeightBytes; + result.memory[weightsMemory].access[AccessType::Scales].bytesRead += int64_t(encodedScaleSize) * access.weightsRefetch; + } + + // Update memory-access cycles and find the maximum memory read cycle time + int64_t maxMemCycles = 0; + for ( auto &[mem, stats] : result.memory ) + { + float bandwidth = mem->Bandwidth(); + int64_t memBytes = 0; + for ( auto &[accType, acc] : stats.access ) + { + // compute cycles per accessType + int64_t bytes = acc.bytesRead + acc.bytesWritten; + memBytes += bytes; + int64_t accCycles = int64_t(float(bytes) / bandwidth); + acc.accessCycles = accCycles; + } + // get maximum cycles per memory + int64_t memCycles = int64_t(float(memBytes) / bandwidth); + maxMemCycles = std::max(maxMemCycles, memCycles); + } + + result.totalCycles = std::max(result.npuCycles, maxMemCycles); + return result; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/network_performance.hpp b/ethosu/regor/compiler/network_performance.hpp new file mode 100644 index 00000000..869acf99 --- /dev/null +++ b/ethosu/regor/compiler/network_performance.hpp @@ -0,0 +1,153 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "compiler/database.hpp" +#include "compiler/graph_optimiser.hpp" +#include "compiler/scheduler.hpp" + +#include + +namespace regor +{ + +/// +/// Performance information for a whole schedule +/// +enum AccessType +{ + Lut = 0, + FeatureMap = 1, + Weights = 2, + Scales = 3, +}; + +struct PerformanceResult +{ + + struct MemoryAccess + { + int64_t bytesRead = 0; + int64_t bytesWritten = 0; + int64_t accessCycles = 0; + + MemoryAccess &operator+=(const MemoryAccess &other) + { + this->bytesRead += other.bytesRead; + this->bytesWritten += other.bytesWritten; + this->accessCycles += other.accessCycles; + return *this; + } + }; + + struct MemoryAccesses + { + std::unordered_map access; + int64_t peakUsage = 0; + int64_t AccessCycles() const + { + int64_t cycles = 0; + for ( const auto &[type, acc] : access ) + { + cycles += acc.accessCycles; + } + return cycles; + } + + MemoryAccesses &operator+=(const MemoryAccesses &other) + { + for ( const auto &[type, acc] : other.access ) + { + access[type] += acc; + } + peakUsage = std::max(peakUsage, other.peakUsage); + return *this; + } + }; + + std::unordered_map memory; + int64_t npuCycles = 0; + int64_t cpuCycles = 0; + int64_t totalCycles = 0; + int64_t macCount = 0; + int64_t cpuOps = 0; + int64_t npuOps = 0; + int64_t cascadedOps = 0; + int64_t cascades = 0; + int64_t originalWeights = 0; + int64_t encodedWeights = 0; + + int Accesses() const + { + int accesses = 0; + for ( const auto &[archMem, stats] : memory ) + { + accesses += 
int(stats.access.size()); + } + return accesses; + } + + PerformanceResult &operator+=(const PerformanceResult &other) + { + // Not ideal for performance + for ( const auto &[arch, memoryStat] : other.memory ) + { + memory[arch] += memoryStat; + } + this->npuCycles += other.npuCycles; + this->cpuCycles += other.cpuCycles; + this->totalCycles += other.totalCycles; + this->macCount += other.macCount; + this->cpuOps += other.cpuOps; + this->npuOps += other.npuOps; + this->cascadedOps += other.cascadedOps; + this->cascades += other.cascades; + this->originalWeights += other.originalWeights; + this->encodedWeights += other.encodedWeights; + return *this; + } +}; + +/// +/// Whole-schedule performance calculation module +/// +class NetworkPerformance +{ +private: + Architecture *_arch; + const std::vector> &_ops; + +public: + NetworkPerformance(Architecture *arch, const std::vector> &ops); + +public: + PerformanceResult Measure(Schedule *schedule, OptimiserDatabase *optDb); + +private: + PerformanceResult EstimateFullOpPerformance( + SchedulerOperation *schedOp, SchedulerOpInfo *cost, SchedulerOperation *prevOp, SchedulerOpInfo *prevCost); + void AddToDatabase(const PerformanceResult &perf, const std::unique_ptr &schedOp, int opTable, + int columns, const std::unordered_set &memories, OptimiserDatabase *optDb); +}; + + + +} // namespace regor diff --git a/ethosu/regor/compiler/op_type.cpp b/ethosu/regor/compiler/op_type.cpp new file mode 100644 index 00000000..1d7ee8bf --- /dev/null +++ b/ethosu/regor/compiler/op_type.cpp @@ -0,0 +1,199 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "compiler/op_type.hpp" + +BEGIN_ENUM_TABLE(regor::OpType) + ADD_ENUM_NAME(None) + // TOSA equivalent ops + ADD_ENUM_NAME(ArgMax) + ADD_ENUM_NAME(AvgPool) + ADD_ENUM_NAME(Conv2D) + ADD_ENUM_NAME(Conv3D) + ADD_ENUM_NAME(DepthwiseConv2DBias) + ADD_ENUM_NAME(FullyConnected) + ADD_ENUM_NAME(MatMul) + ADD_ENUM_NAME(MaxPool) + ADD_ENUM_NAME(TransposeConv2D) + ADD_ENUM_NAME(Clamp) + ADD_ENUM_NAME(Sigmoid) + ADD_ENUM_NAME(Tanh) + ADD_ENUM_NAME(Add) + ADD_ENUM_NAME(Asr) + ADD_ENUM_NAME(And) + ADD_ENUM_NAME(Or) + ADD_ENUM_NAME(Xor) + ADD_ENUM_NAME(Div) + ADD_ENUM_NAME(LogicalAnd) + ADD_ENUM_NAME(SHL) + ADD_ENUM_NAME(SHR) + ADD_ENUM_NAME(LogicalOr) + ADD_ENUM_NAME(LogicalXor) + ADD_ENUM_NAME(Maximum) + ADD_ENUM_NAME(Minimum) + ADD_ENUM_NAME(Mul) + ADD_ENUM_NAME(Pow) + ADD_ENUM_NAME(Sub) + ADD_ENUM_NAME(LUT) + ADD_ENUM_NAME(Abs) + ADD_ENUM_NAME(Not) + ADD_ENUM_NAME(Ceil) + ADD_ENUM_NAME(CLZ) + ADD_ENUM_NAME(Exp) + ADD_ENUM_NAME(Floor) + ADD_ENUM_NAME(LogicalNot) + ADD_ENUM_NAME(Neg) + ADD_ENUM_NAME(Reciprocal) + ADD_ENUM_NAME(Rsqrt) + ADD_ENUM_NAME(Select) + ADD_ENUM_NAME(Equal) + ADD_ENUM_NAME(Greater) + ADD_ENUM_NAME(GreaterEqual) + ADD_ENUM_NAME(ReduceAny) + ADD_ENUM_NAME(ReduceAll) + ADD_ENUM_NAME(ReduceMax) + ADD_ENUM_NAME(ReduceMin) + ADD_ENUM_NAME(ReduceProduct) + ADD_ENUM_NAME(ReduceSum) + ADD_ENUM_NAME(Concat) + ADD_ENUM_NAME(Pad) + ADD_ENUM_NAME(Reshape) + ADD_ENUM_NAME(Reverse) + ADD_ENUM_NAME(Slice) + ADD_ENUM_NAME(Tile) + ADD_ENUM_NAME(Transpose) + ADD_ENUM_NAME(Gather) + ADD_ENUM_NAME(Scatter) + ADD_ENUM_NAME(Resize) + ADD_ENUM_NAME(Cast) + 
ADD_ENUM_NAME(Rescale) + ADD_ENUM_NAME(Identity) + ADD_ENUM_NAME(If) + ADD_ENUM_NAME(While) + // Regor Internal Operators + ADD_ENUM_NAME(MemoryCopy) + // Compatibility Operators + ADD_ENUM_NAME(AddN) + ADD_ENUM_NAME(Any) + ADD_ENUM_NAME(ArgMin) + ADD_ENUM_NAME(BatchMatMul) + ADD_ENUM_NAME(BatchToSpaceND) + ADD_ENUM_NAME(BidirectionalSequenceLstm) + ADD_ENUM_NAME(BidirectionalSequenceRnn) + ADD_ENUM_NAME(BlockLSTM) + ADD_ENUM_NAME(Call) + ADD_ENUM_NAME(Clip) + ADD_ENUM_NAME(ConcatEmbeddings) + ADD_ENUM_NAME(ConcatTFLite) + ADD_ENUM_NAME(Const) + ADD_ENUM_NAME(Conv2DBackpropInput) + ADD_ENUM_NAME(Conv2DBackpropInputSwitchedBias) + ADD_ENUM_NAME(Conv2DBias) + ADD_ENUM_NAME(Cos) + ADD_ENUM_NAME(Cumsum) + ADD_ENUM_NAME(Custom) + ADD_ENUM_NAME(CustomNpuOp) + ADD_ENUM_NAME(Delegate) + ADD_ENUM_NAME(Densify) + ADD_ENUM_NAME(DepthToSpace) + ADD_ENUM_NAME(Dequantize) + ADD_ENUM_NAME(Elu) + ADD_ENUM_NAME(EmbeddingLookup) + ADD_ENUM_NAME(EmbeddingLookupSparse) + ADD_ENUM_NAME(ExpandDims) + ADD_ENUM_NAME(FakeQuantWithMinMaxArgs) + ADD_ENUM_NAME(Fill) + ADD_ENUM_NAME(FloorDiv) + ADD_ENUM_NAME(FloorMod) + ADD_ENUM_NAME(GatherNd) + ADD_ENUM_NAME(GatherV2) + ADD_ENUM_NAME(HardSwish) + ADD_ENUM_NAME(HashtableLookup) + ADD_ENUM_NAME(L2Norm) + ADD_ENUM_NAME(L2Pool2D) + ADD_ENUM_NAME(LRN) + ADD_ENUM_NAME(LSHProjection) + ADD_ENUM_NAME(LeakyRelu) + ADD_ENUM_NAME(Less) + ADD_ENUM_NAME(LessEqual) + ADD_ENUM_NAME(Log) + ADD_ENUM_NAME(LogSoftmax) + ADD_ENUM_NAME(Lstm) + ADD_ENUM_NAME(MatrixDiag) + ADD_ENUM_NAME(MatrixSetDiag) + ADD_ENUM_NAME(Max) + ADD_ENUM_NAME(Mean) + ADD_ENUM_NAME(Min) + ADD_ENUM_NAME(MirrorPad) + ADD_ENUM_NAME(NonMaxSuppressionV4) + ADD_ENUM_NAME(NonMaxSuppressionV5) + ADD_ENUM_NAME(NotEqual) + ADD_ENUM_NAME(OneHot) + ADD_ENUM_NAME(Pack) + ADD_ENUM_NAME(PadV2) + ADD_ENUM_NAME(Placeholder) + ADD_ENUM_NAME(Prelu) + ADD_ENUM_NAME(Prod) + ADD_ENUM_NAME(Quantize) + ADD_ENUM_NAME(QuantizedAvgPool) + ADD_ENUM_NAME(QuantizedConv2D) + ADD_ENUM_NAME(QuantizedMatMul) + 
ADD_ENUM_NAME(QuantizedMaxPool) + ADD_ENUM_NAME(QuantizedReshape) + ADD_ENUM_NAME(Range) + ADD_ENUM_NAME(Rank) + ADD_ENUM_NAME(Relu) + ADD_ENUM_NAME(Relu6) + ADD_ENUM_NAME(ReluN1To1) + ADD_ENUM_NAME(ReluN) + ADD_ENUM_NAME(ResizeBilinear) + ADD_ENUM_NAME(ResizeNearestNeighbor) + ADD_ENUM_NAME(ReverseSequence) + ADD_ENUM_NAME(ReverseV2) + ADD_ENUM_NAME(Rnn) + ADD_ENUM_NAME(Round) + ADD_ENUM_NAME(ScatterNd) + ADD_ENUM_NAME(SegmentSum) + ADD_ENUM_NAME(SelectV2) + ADD_ENUM_NAME(Shape) + ADD_ENUM_NAME(SignBit) + ADD_ENUM_NAME(Sin) + ADD_ENUM_NAME(SkipGram) + ADD_ENUM_NAME(Softmax) + ADD_ENUM_NAME(SpaceToBatchND) + ADD_ENUM_NAME(SpaceToDepth) + ADD_ENUM_NAME(SparseToDense) + ADD_ENUM_NAME(Split) + ADD_ENUM_NAME(SplitV) + ADD_ENUM_NAME(Sqrt) + ADD_ENUM_NAME(Square) + ADD_ENUM_NAME(SquaredDifference) + ADD_ENUM_NAME(Squeeze) + ADD_ENUM_NAME(StridedSlice) + ADD_ENUM_NAME(SubgraphInput) + ADD_ENUM_NAME(Sum) + ADD_ENUM_NAME(Svdf) + ADD_ENUM_NAME(TopKV2) + ADD_ENUM_NAME(UnidirectionalSequenceLstm) + ADD_ENUM_NAME(UnidirectionalSequenceRnn) + ADD_ENUM_NAME(Unique) + ADD_ENUM_NAME(Unpack) + ADD_ENUM_NAME(Where) + ADD_ENUM_NAME(ZerosLike) + ADD_ENUM_NAME(LookupTable) +END_ENUM_TABLE() diff --git a/ethosu/regor/compiler/op_type.hpp b/ethosu/regor/compiler/op_type.hpp new file mode 100644 index 00000000..018344a0 --- /dev/null +++ b/ethosu/regor/compiler/op_type.hpp @@ -0,0 +1,290 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/bit_flags.hpp" + +#include +#include + +namespace regor +{ + +enum class OpType : uint16_t +{ + None = 0, + // TOSA equivalent ops + ArgMax, + AvgPool, + Conv2D, + Conv3D, + DepthwiseConv2DBias, + FullyConnected, + MatMul, + MaxPool, + TransposeConv2D, + Clamp, + Sigmoid, + Tanh, + Add, + Asr, + And, + Or, + Xor, + Div, + LogicalAnd, + SHL, + SHR, + LogicalOr, + LogicalXor, + Maximum, + Minimum, + Mul, + Pow, + Sub, + LUT, + Abs, + Not, + Ceil, + CLZ, + Exp, + Floor, + LogicalNot, + Neg, + Reciprocal, + Rsqrt, + Select, + Equal, + Greater, + GreaterEqual, + ReduceAny, + ReduceAll, + ReduceMax, + ReduceMin, + ReduceProduct, + ReduceSum, + Concat, + Pad, + Reshape, + Reverse, + Slice, + Tile, + Transpose, + Gather, + Scatter, + Resize, + Cast, + Rescale, + Identity, + If, + While, + + // Regor Internal Operators + MemoryCopy, + + // Compatibility Operators + AddN, + Any, + ArgMin, + BatchMatMul, + BatchToSpaceND, + BidirectionalSequenceLstm, + BidirectionalSequenceRnn, + BlockLSTM, + Call, + Clip, + ConcatEmbeddings, + ConcatTFLite, + Const, + Conv2DBackpropInput, + Conv2DBackpropInputSwitchedBias, + Conv2DBias, + Cos, + Cumsum, + Custom, + CustomNpuOp, + Delegate, + Densify, + DepthToSpace, + Dequantize, + Elu, + EmbeddingLookup, + EmbeddingLookupSparse, + ExpandDims, + FakeQuantWithMinMaxArgs, + Fill, + FloorDiv, + FloorMod, + GatherNd, + GatherV2, + HardSwish, + HashtableLookup, + L2Norm, + L2Pool2D, + LRN, + LSHProjection, + LeakyRelu, + Less, + LessEqual, + Log, + LogSoftmax, + Lstm, + MatrixDiag, + MatrixSetDiag, + Max, + Mean, + Min, + MirrorPad, + NonMaxSuppressionV4, + NonMaxSuppressionV5, + NotEqual, + OneHot, + Pack, + PadV2, + Placeholder, + Prelu, + Prod, + Quantize, + QuantizedAvgPool, + QuantizedConv2D, + QuantizedMatMul, + QuantizedMaxPool, + QuantizedReshape, + Range, + Rank, + Relu, + Relu6, + ReluN1To1, 
+ ReluN, + ResizeBilinear, + ResizeNearestNeighbor, + ReverseSequence, + ReverseV2, + Rnn, + Round, + ScatterNd, + SegmentSum, + SelectV2, + Shape, + SignBit, + Sin, + SkipGram, + Softmax, + SpaceToBatchND, + SpaceToDepth, + SparseToDense, + Split, + SplitV, + Sqrt, + Square, + SquaredDifference, + Squeeze, + StridedSlice, + SubgraphInput, + Sum, + Svdf, + TopKV2, + UnidirectionalSequenceLstm, + UnidirectionalSequenceRnn, + Unique, + Unpack, + Where, + ZerosLike, + LookupTable, + ENUM_END +}; + +inline std::string OpTypeToString(const OpType type) +{ + return EnumToString(type); +} + +constexpr inline bool IsUnaryElementwise(OpType opType) +{ + return opType == OpType::Abs || opType == OpType::LeakyRelu || opType == OpType::CLZ || + opType == OpType::LogicalNot || opType == OpType::Not || opType == OpType::Neg; +} + +constexpr inline bool IsBinaryElementwise(OpType opType) +{ + return opType == OpType::Add || opType == OpType::Sub || opType == OpType::Mul || opType == OpType::Minimum || + opType == OpType::Maximum || opType == OpType::SHL || opType == OpType::SHR || opType == OpType::Div || + opType == OpType::LogicalAnd || opType == OpType::LogicalOr || opType == OpType::LogicalXor || + opType == OpType::Xor || opType == OpType::And || opType == OpType::Or || opType == OpType::Asr || + opType == OpType::Equal || opType == OpType::Greater || opType == OpType::GreaterEqual || opType == OpType::NotEqual; +} + +constexpr inline bool IsElementwise(OpType opType) +{ + return IsUnaryElementwise(opType) || IsBinaryElementwise(opType); +} + +constexpr inline bool IsDepthwise(OpType opType) +{ + return opType == OpType::DepthwiseConv2DBias; +} + +constexpr inline bool IsConvolution(OpType opType) +{ + return opType == OpType::Conv2D || opType == OpType::Conv2DBackpropInput || opType == OpType::Conv2DBackpropInputSwitchedBias || + opType == OpType::Conv2DBias || opType == OpType::DepthwiseConv2DBias; +} + +constexpr inline bool IsPooling(OpType opType) +{ + return opType == 
OpType::MaxPool || opType == OpType::AvgPool || opType == OpType::QuantizedAvgPool || opType == OpType::QuantizedMaxPool || + opType == OpType::ReduceSum || opType == OpType::Sum || opType == OpType::Min || opType == OpType::ArgMax; +} + +constexpr inline bool IsVectorProduct(OpType opType) +{ + return opType == OpType::FullyConnected || opType == OpType::BidirectionalSequenceLstm || opType == OpType::BidirectionalSequenceRnn || + opType == OpType::BlockLSTM || opType == OpType::Lstm || opType == OpType::MatMul || opType == OpType::Rnn || + opType == OpType::UnidirectionalSequenceLstm || opType == OpType::UnidirectionalSequenceRnn; +} + +constexpr inline bool IsDma(OpType opType) +{ + return opType == OpType::Scatter || opType == OpType::ScatterNd || opType == OpType::Gather || opType == OpType::GatherV2 || opType == OpType::GatherNd; +} + +constexpr inline bool IsActivation(OpType opType) +{ + return opType == OpType::Relu || opType == OpType::Relu6 || opType == OpType::ReluN || opType == OpType::ReluN1To1 || + opType == OpType::Prelu || opType == OpType::Clip || opType == OpType::Sigmoid || opType == OpType::Tanh || opType == OpType::LUT; +} + +constexpr inline bool IsConcatenation(OpType opType) +{ + return opType == OpType::Concat || opType == OpType::ConcatEmbeddings || opType == OpType::ConcatTFLite; +} + +constexpr inline bool IsVariadic(OpType opType) +{ + return IsConcatenation(opType) || opType == OpType::Pack || opType == OpType::Maximum || opType == OpType::Minimum || + opType == OpType::AddN || opType == OpType::Custom || opType == OpType::CustomNpuOp; +} + +constexpr inline bool IsReshape(OpType opType) +{ + // The Reshape like operations: Reshape, Squeeze, and ExpandDims + return opType == OpType::Reshape || opType == OpType::QuantizedReshape || opType == OpType::Squeeze || opType == OpType::ExpandDims; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/operation.cpp b/ethosu/regor/compiler/operation.cpp new file mode 100644 index 
00000000..5ffb6281 --- /dev/null +++ b/ethosu/regor/compiler/operation.cpp @@ -0,0 +1,168 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "compiler/operation.hpp" + +#include "common/ordered_map.hpp" +#include "common/scaling.hpp" +#include "graph_builder.hpp" +#include "kernel.hpp" +#include "op_type.hpp" +#include "tensor.hpp" + +#include + +namespace regor +{ + +Operation::Operation(OpType opType) : _type(opType), _parameters({}), _rounding(RoundMode::AUTO) +{ + // Default 1x1 kernel for ops without a kernel + _kernel = std::make_unique(Point2i(1, 1), Point2i(1, 1), Point2i(1, 1)); +} + +Operation::Operation(const Operation &op) : Operation(op._type) +{ + _kernel = std::make_unique(*op._kernel.get()); + _parameters = op._parameters; + _passthrough = op._passthrough; +} + +TensorUsage Operation::UsageOfTensor(const Tensor *tensor) const +{ + for ( const auto &list : {_inputs.pairs(), _outputs.pairs()} ) + { + for ( const auto &pair : list ) + { + if ( tensor == pair.second.tensor.get() ) + { + return pair.first; + } + } + } + return TensorUsage::None; +} + +Tensor *Operation::IFM(int index) const +{ + auto conn = Input(MakeTensorUsage(TensorUsage::IFM, index)); + return conn ? 
conn->tensor.get() : nullptr; +} + +Tensor *Operation::OFM() const +{ + return _outputs.at(TensorUsage::OFM).tensor.get(); +} + +void Operation::CopyInput(TensorUsage usage, const TensorConnection &tensorConnection) +{ + ConnectInput(usage, tensorConnection.tensor) + .Set(tensorConnection.shape) + .Set(tensorConnection.slice) + .Set(tensorConnection.quantization) + .Set(tensorConnection.transpose); +} + +TensorConnection &Operation::ConnectInput(TensorUsage usage, const std::shared_ptr &tensor) +{ + // Must create the new connection before destroying whatever it replaces, + // because the existing connection (if present) might be the last remaining reference to this operation. + tensor->AddReader(shared_from_this()); + + if ( _inputs.contains(usage) && (_inputs[usage].tensor != tensor) ) + { + _inputs[usage].tensor->RemoveReader(shared_from_this()); + } + _inputs[usage].tensor = tensor; + _inputs[usage].shape = tensor->StorageShape(); + return _inputs[usage]; +} + +void Operation::CopyOutput(TensorUsage usage, const TensorConnection &tensorConnection) +{ + ConnectOutput(usage, tensorConnection.tensor) + .Set(tensorConnection.shape) + .Set(tensorConnection.slice) + .Set(tensorConnection.quantization) + .Set(tensorConnection.transpose); +} + +TensorConnection &Operation::ConnectOutput(TensorUsage usage, const std::shared_ptr &tensor) +{ + // Must create the new connection before destroying whatever it replaces, + // because the existing connection (if present) might be the last remaining reference to this operation. 
+ tensor->AddWriter(shared_from_this()); + + if ( _outputs.contains(usage) && (_outputs[usage].tensor != tensor) ) + { + _outputs[usage].tensor->RemoveWriter(shared_from_this()); + } + _outputs[usage].tensor = tensor; + _outputs[usage].shape = tensor->StorageShape(); + + return _outputs[usage]; +} + +void Operation::Disconnect() +{ + // This operation might be about to remove the last remaining references to itself, + // so it must hold onto this one until it is finished disconnecting. + auto self = shared_from_this(); + + for ( auto &conn : _inputs ) + { + conn.tensor->RemoveReader(self); + } + _inputs.clear(); + + for ( auto &conn : _outputs ) + { + conn.tensor->RemoveWriter(self); + } + _outputs.clear(); +} + +bool Operation::IsDisconnected() const +{ + return _inputs.empty() && _outputs.empty(); +} + +bool Operation::HasScaling() const +{ + bool scaled = true; + for ( const auto &fm : _inputs.pairs() ) + { + if ( fm.first == TensorUsage::IFM || fm.first == TensorUsage::IFM1 || fm.first == TensorUsage::OFM ) + { + if ( fm.second.quantization.scales.empty() ) + { + return false; + } + } + } + return scaled; +} + +void Operation::SetZeroPoint(GraphApi::GraphTensorUsage graphUsage, double zeroPoint) +{ + auto usage = GraphAPIUsageToTensorUsage(graphUsage); + auto &connections = (usage & regor::TensorUsage::TypeMask) == regor::TensorUsage::OFM ? _outputs : _inputs; + connections.at(usage).quantization.zeroPoints = {int64_t(zeroPoint)}; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/operation.hpp b/ethosu/regor/compiler/operation.hpp new file mode 100644 index 00000000..1c23b6f5 --- /dev/null +++ b/ethosu/regor/compiler/operation.hpp @@ -0,0 +1,234 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "attributes.hpp" +#include "common/ordered_map.hpp" +#include "common/reverse_type.hpp" +#include "common/scaling.hpp" +#include "common/transpose_type.hpp" +#include "include/graphapi.hpp" +#include "kernel.hpp" +#include "op_type.hpp" +#include "quantization.hpp" +#include "tensor.hpp" + +#include + +namespace regor +{ + +enum class RoundMode : uint8_t +{ + DBL = 0, + TRUNCATE = 1, + NATURAL = 2, + TRUNCATE_TO_LOWER = 3, + DOUBLE_ASYMMETRIC = 4, + SYMMETRIC = 5, + AUTO = 0xff +}; + +// Parameters that apply only to particular operation types +union OpTypeParameters +{ + struct + { + float alpha; + } leaky_relu; + + struct + { + float beta; + } softmax; + + struct + { + int axis; + } concat; + + struct + { + int axis; + } pack_unpack; + + struct + { + int begin_mask; + int end_mask; + int ellipsis_mask; + int new_axis_mask; + int shrink_axis_mask; + } strided_slice; + + struct + { + bool alignCorners; + bool halfPixelCenters; + } resize; +}; + +struct TensorSlice +{ + Shape offset; + Shape shape; +}; + +struct TensorConnection +{ + std::shared_ptr tensor; + Shape shape; + // For operations accessing a slice of the tensor: + // Writing: Concat, ConcatTFLite, and Pack + // Reading: Split, SplitV, Unpack, Slice, and StridedSlice + TensorSlice slice; + Quantization quantization; + TransposeType transpose = TransposeType::None; + ReverseType reverse = ReverseType::None; + + TensorConnection &Set(const Shape &s) + { + shape = s; + return *this; + } + TensorConnection &Set(const TensorSlice &s) + { + slice = s; + return *this; + } 
+ TensorConnection &Set(const Quantization &q) + { + quantization = q; + return *this; + } + TensorConnection &Set(const TransposeType &t) + { + transpose = t; + return *this; + } + TensorConnection &Set(const ReverseType &r) + { + reverse = r; + return *this; + } +}; + + +/// +/// Graph Operation representation +/// +class Operation : public std::enable_shared_from_this, public GraphApi::GraphOperation +{ +private: + ordered_map _inputs; + ordered_map _outputs; + OpType _type; + std::unique_ptr _kernel; + DynamicRef _attr; + OpTypeParameters _parameters; // TODO: remove me + RoundMode _rounding; + const void *_passthrough = nullptr; // Original flatbuffer description of this op (if it was loaded from one) + +public: + Operation(OpType opType); + Operation(const Operation &op); + OpType Type() const { return _type; } + + const ordered_map &Outputs() const { return _outputs; } + const ordered_map &Inputs() const { return _inputs; } + + Tensor *IFM(int index) const; + Tensor *OFM() const; + + TensorConnection *Input(TensorUsage usage) { return _inputs.try_ref(usage); } + const TensorConnection *Input(TensorUsage usage) const { return _inputs.try_ref(usage); } + TensorConnection *Output(TensorUsage usage) { return _outputs.try_ref(usage); } + const TensorConnection *Output(TensorUsage usage) const { return _outputs.try_ref(usage); } + + TensorUsage UsageOfTensor(const Tensor *tensor) const; + const class Kernel *Kernel() const { return _kernel.get(); } + class Kernel *Kernel() { return _kernel.get(); } + void SetKernel(std::unique_ptr kernel) { _kernel = std::move(kernel); } + + const OpTypeParameters &Parameters() const { return _parameters; } + OpTypeParameters &Parameters() { return _parameters; } + + RoundMode Rounding() const { return _rounding; } + void SetRounding(RoundMode rounding) { _rounding = rounding; } + + const void *Passthrough() const { return _passthrough; } + void SetPassthrough(const void *passthrough) { _passthrough = passthrough; } + + void 
CopyInput(TensorUsage usage, const TensorConnection &tensorConnection); + TensorConnection &ConnectInput(TensorUsage usage, const std::shared_ptr &tensor); + int CountInputs(TensorUsage usage) const { return CountUsage(_inputs, usage); } + + void CopyOutput(TensorUsage usage, const TensorConnection &tensorConnection); + TensorConnection &ConnectOutput(TensorUsage usage, const std::shared_ptr &tensor); + int CountOutputs(TensorUsage usage) const { return CountUsage(_outputs, usage); } + + void Disconnect(); + bool IsDisconnected() const; + bool HasScaling() const; + + // Inherited via GraphOperation + void SetZeroPoint(GraphApi::GraphTensorUsage tensor, double zeroPoint) override; + + template + TYPE *Attribute() + { + if ( !_attr ) + { + _attr = CreateAttribute(TypeHash::HASH); + } + else if ( _attr.Info()->Hash() != TypeHash::HASH ) + { + throw std::runtime_error("attribute already assigned for this operator"); + } + + return static_cast(_attr.Instance()); + } + + DynamicRef *AttributeByKey(uint32_t hash) + { + if ( !_attr ) + { + _attr = CreateAttribute(hash); + } + return &_attr; + } + + const DynamicRef &AttributeRef() const { return _attr; } + +private: + int CountUsage(const ordered_map &list, TensorUsage usage) const + { + int count = 0; + for ( const auto &pair : list.pairs() ) + { + if ( (pair.first & TensorUsage::TypeMask) == usage ) + { + count++; + } + } + return count; + } +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/operation_util.hpp b/ethosu/regor/compiler/operation_util.hpp new file mode 100644 index 00000000..63ef2e74 --- /dev/null +++ b/ethosu/regor/compiler/operation_util.hpp @@ -0,0 +1,242 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "architecture/architecture.hpp" +#include "common/buffer_view.hpp" +#include "operation.hpp" +#include "quantization.hpp" +#include "tensor.hpp" + +namespace regor +{ + +inline std::shared_ptr CreateConstTensor( + const std::string &name, DataType type, const std::shared_ptr &buffer, const Shape *shape = nullptr) +{ + Shape tensorShape; + if ( shape == nullptr ) + { + tensorShape = Shape(DataTypeElements(type, buffer->Size())); + } + else + { + tensorShape = *shape; + } + auto tensor = std::make_shared(name, type, tensorShape, buffer); + return tensor; +} + +template +std::shared_ptr CreateConstTensor(const std::string &name, T value) +{ + auto buf = std::make_shared(std::vector{value}); + return CreateConstTensor(name, DataTypeOf::value, buf); +} + +template +std::shared_ptr NewBufferFromView(const BufferView &src) +{ + auto size = src.ViewShape().Elements(); + auto buf = std::make_shared(size, static_cast(nullptr)); + auto bufView = BufferView(buf, src); + const auto srcData = src.Values(); + auto dstData = bufView.template WritableValues(); + for ( int i = 0; i < size; i++ ) + { + dstData[i] = srcData[i]; + } + return buf; +} + +inline std::shared_ptr CreateSliceCopy(const std::string &name, const Tensor *src, const TensorSlice &slice) +{ + assert(src->IsConstant()); + auto sliceView = src->View().SubView(slice.offset, slice.shape); + std::shared_ptr buffer; + switch ( src->Type() ) + { + case DataType::Int8: + buffer = NewBufferFromView(sliceView); + break; + case DataType::UInt8: + buffer = NewBufferFromView(sliceView); 
+ break; + case DataType::Int16: + buffer = NewBufferFromView(sliceView); + break; + case DataType::Int32: + buffer = NewBufferFromView(sliceView); + break; + default: + assert(false); + break; + } + auto tensor = CreateConstTensor(name, src->Type(), buffer, &slice.shape); + return tensor; +} + +inline Operation *CreateLUT(const std::shared_ptr &ifm, const std::shared_ptr &lut, const Quantization &ifmQuantization, + const Quantization &ofmQuantization, DataType dtype = DataType::None, const Shape *ifmShape = nullptr, + std::shared_ptr ofm = nullptr, TensorSlice ifmSlice = {}, TensorSlice ofmSlice = {}) +{ + auto op = std::make_shared(OpType::LUT); + if ( dtype == DataType::None ) + { + dtype = lut->Type(); + } + if ( ifmShape == nullptr ) + { + ifmShape = &ifm->StorageShape(); + } + op->ConnectInput(TensorUsage::IFM, ifm).Set(*ifmShape).Set(ifmQuantization).Set(ifmSlice); + + op->ConnectInput(TensorUsage::LUT, lut); + if ( ofm == nullptr ) + { + ofm = std::make_shared(ifm->Name() + "/lut", dtype); + ofm->SetStorageShape(*ifmShape); + } + op->ConnectOutput(TensorUsage::OFM, ofm).Set(ofm->StorageShape()).Set(ofmQuantization).Set(ofmSlice); + return op.get(); +} + +inline Operation *CreateDepthwiseMaxpool(const std::shared_ptr &ifm, const Shape &ifmShape, + const Quantization &ifmQuantization, const Quantization &ofmQuantization) +{ + auto op = std::make_shared(OpType::MaxPool); + int height = ifmShape.ElementsWH(); + int width = ifmShape.Depth(); + auto kernel = std::make_unique(Point2i(width, 1), Point2i(1, 1), Point2i(1, 1), 1); + auto ofm = std::make_shared(ifm->Name() + "/maxpool", ifm->Type()); + ofm->SetStorageShape(Shape(1, ifmShape.Height(), ifmShape.Width(), 1)); + op->SetKernel(std::move(kernel)); + + op->ConnectInput(TensorUsage::IFM, ifm).Set(ifmQuantization); + op->Input(TensorUsage::IFM)->shape = Shape(1, height, width, 1); + op->ConnectOutput(TensorUsage::OFM, ofm).Set(ofmQuantization); + op->Output(TensorUsage::OFM)->shape = Shape(1, height, 1, 1); + 
return op.get(); +} + +inline Operation *CreateReduceSum(const std::shared_ptr &ifm, const Quantization &ifmQuantization, const Quantization &ofmQuantization) +{ + const auto &ifmShape = ifm->StorageShape(); + auto op = std::make_shared(OpType::ReduceSum); + auto ofm = std::make_shared(ifm->Name() + "/reducesum", DataType::Int32); + ofm->SetStorageShape(Shape(1, ifmShape.Height(), ifmShape.Width(), 1)); + op->ConnectInput(TensorUsage::IFM, ifm).Set(ifmQuantization); + op->ConnectOutput(TensorUsage::OFM, ofm).Set(ofmQuantization); + return op.get(); +} + +inline Operation *CreateElementwise(OpType type, const std::shared_ptr &ifm, const std::shared_ptr &ifm2, + const Quantization &ifmQuantization, const Quantization &ifm2Quantization, const Quantization &ofmQuantization, + DataType dtype = DataType::None, const Shape *ifmShape = nullptr, const Shape *ifm2Shape = nullptr) +{ + assert(IsElementwise(type)); + auto op = std::make_shared(type); + op->ConnectInput(TensorUsage::IFM, ifm).Set(ifmQuantization); + if ( ifmShape ) op->Input(TensorUsage::IFM)->shape = *ifmShape; + if ( ifm2 ) + { + op->ConnectInput(TensorUsage::IFM1, ifm2).Set(ifm2Quantization); + if ( ifm2Shape ) op->Input(TensorUsage::IFM1)->shape = *ifm2Shape; + } + + if ( dtype == DataType::None ) dtype = ifm->Type(); + + Shape ofmShape = op->Input(TensorUsage::IFM)->shape; + // If reverse operands use ifm2 shape as ofm shape + if ( ifm2 && ((ofmShape.Elements() == 1 && ifm->IsConstant()) || ofmShape.IsSubShapeOf(op->Input(TensorUsage::IFM1)->shape)) ) + { + ofmShape = ifm2->StorageShape(); + } + + auto ofm = std::make_shared(ifm->Name() + "/" + OpTypeToString(type), dtype); + ofm->SetStorageShape(ofmShape); + op->ConnectOutput(TensorUsage::OFM, ofm).Set(ofmQuantization); + return op.get(); +} + +inline Operation *CreateBinaryElementwise(OpType type, const std::shared_ptr &ifm, const std::shared_ptr &ifm2, + const Quantization &ifmQuantization, const Quantization &ifm2Quantization, const Quantization 
&ofmQuantization, + DataType dtype = DataType::None, const Shape *ifmShape = nullptr, const Shape *ifm2Shape = nullptr) +{ + assert(IsBinaryElementwise(type)); + return CreateElementwise(type, ifm, ifm2, ifmQuantization, ifm2Quantization, ofmQuantization, dtype, ifmShape, ifm2Shape); +} + +inline Operation *CreateUnaryElementwise(OpType type, const std::shared_ptr &ifm, const Quantization &ifmQuantization, + const Quantization &ofmQuantization, DataType dtype = DataType::None, const Shape *ifmShape = nullptr) +{ + assert(IsUnaryElementwise(type)); + return CreateElementwise(type, ifm, nullptr, ifmQuantization, {}, ofmQuantization, dtype, ifmShape); +} + +inline Operation *CreateClz(const std::shared_ptr &ifm, const Quantization &ifmQuantization, + const Quantization &ofmQuantization, DataType dtype = DataType::None, const Shape *ifmShape = nullptr) +{ + return CreateUnaryElementwise(OpType::CLZ, ifm, ifmQuantization, ofmQuantization, dtype, ifmShape); +} + +inline Operation *CreateAdd(const std::shared_ptr &ifm, const std::shared_ptr &ifm2, + const Quantization &ifmQuantization, const Quantization &ifm2Quantization, const Quantization &ofmQuantization, + DataType dtype = DataType::None, const Shape *ifmShape = nullptr, const Shape *ifm2Shape = nullptr) +{ + return CreateBinaryElementwise(OpType::Add, ifm, ifm2, ifmQuantization, ifm2Quantization, ofmQuantization, dtype, ifmShape, ifm2Shape); +} + +inline Operation *CreateMul(const std::shared_ptr &ifm, const std::shared_ptr &ifm2, + const Quantization &ifmQuantization, const Quantization &ifm2Quantization, const Quantization &ofmQuantization, + DataType dtype = DataType::None, const Shape *ifmShape = nullptr, const Shape *ifm2Shape = nullptr) +{ + return CreateBinaryElementwise(OpType::Mul, ifm, ifm2, ifmQuantization, ifm2Quantization, ofmQuantization, dtype, ifmShape, ifm2Shape); +} + +inline Operation *CreateSub(const std::shared_ptr &ifm, const std::shared_ptr &ifm2, + const Quantization &ifmQuantization, const 
Quantization &ifm2Quantization, const Quantization &ofmQuantization, + DataType dtype = DataType::None, const Shape *ifmShape = nullptr, const Shape *ifm2Shape = nullptr) +{ + return CreateBinaryElementwise(OpType::Sub, ifm, ifm2, ifmQuantization, ifm2Quantization, ofmQuantization, dtype, ifmShape, ifm2Shape); +} + +inline Operation *CreateShl(const std::shared_ptr &ifm, const std::shared_ptr &ifm2, + const Quantization &ifmQuantization, const Quantization &ifm2Quantization, const Quantization &ofmQuantization, + DataType dtype = DataType::None, const Shape *ifmShape = nullptr, const Shape *ifm2Shape = nullptr) +{ + return CreateBinaryElementwise(OpType::SHL, ifm, ifm2, ifmQuantization, ifm2Quantization, ofmQuantization, dtype, ifmShape, ifm2Shape); +} + +inline Operation *CreateAsr(const std::shared_ptr &ifm, const std::shared_ptr &ifm2, + const Quantization &ifmQuantization, const Quantization &ifm2Quantization, const Quantization &ofmQuantization, + DataType dtype = DataType::None, const Shape *ifmShape = nullptr, const Shape *ifm2Shape = nullptr) +{ + return CreateBinaryElementwise(OpType::Asr, ifm, ifm2, ifmQuantization, ifm2Quantization, ofmQuantization, dtype, ifmShape, ifm2Shape); +} + +inline Operation *CreateRescaleAdd(const std::shared_ptr &ifm, const std::shared_ptr &ifm2, const Quantization &ifmQuantization, + const Quantization &ifm2Quantization, const Quantization &ofmQuantization, int32_t scale, int shift) +{ + auto op = CreateBinaryElementwise(OpType::Add, ifm, ifm2, ifmQuantization, ifm2Quantization, ofmQuantization); + op->Output(TensorUsage::OFM)->quantization.scales.push_back(QuantizedScale(scale, shift)); + return op; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/optimiser_utils.cpp b/ethosu/regor/compiler/optimiser_utils.cpp new file mode 100644 index 00000000..4090d9c9 --- /dev/null +++ b/ethosu/regor/compiler/optimiser_utils.cpp @@ -0,0 +1,143 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its 
affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#include "optimiser_utils.hpp" + +#include "operation.hpp" +namespace regor::GraphOptimisation +{ + +// Find specified tensor in Inputs() / Outputs() vectors. +// returns true if found in given vector. +bool IsTensorInVector(const std::vector> &tensorVec, const Tensor *const tensorToFind) +{ + auto pos = std::find_if( + tensorVec.begin(), tensorVec.end(), [&](const std::shared_ptr &t) { return t.get() == tensorToFind; }); + + return (pos != tensorVec.end()); +} + +// Insert a MemoryCopy operation after given ifm tensor. Returns a copy op shared_ptr. +// Will make a clone of ifm as ofm and connects any other consumers of the ifm to it. 
+std::shared_ptr InsertCopyOpAfterTensor(const std::shared_ptr &ifm, const Quantization &quantization) +{ + std::shared_ptr copyTensor = ifm->Clone(); + auto copyOp = std::make_shared(OpType::MemoryCopy); + copyOp->ConnectInput(TensorUsage::IFM0, ifm).Set(quantization); + auto name = ifm->Name(); + name.append("_copy"); + copyTensor->SetName(name); + copyOp->ConnectOutput(TensorUsage::OFM, copyTensor).Set(quantization); + + std::vector> ifmReaders(ifm->Readers()); + for ( const auto &opReader : ifmReaders ) + { + auto *cons = opReader.get(); + if ( cons != copyOp.get() ) + { + auto idx = 0; + auto usage = MakeTensorUsage(TensorUsage::IFM, 0); + auto *consIfmConn = cons->Input(usage); + + while ( consIfmConn != nullptr ) + { + if ( consIfmConn->tensor.get() == ifm.get() ) + { + cons->ConnectInput(usage, copyTensor).Set(quantization); + } + usage = MakeTensorUsage(TensorUsage::IFM, ++idx); + consIfmConn = cons->Input(usage); + } + } + } + return copyOp; +} + +// Connects output to operations in given list. Will not replace connection shape. +// Parameters: +// - producerList: List of producers. +// - tensorToReplace: if OFM on consumer match this tensor, replace it. +// - newTensor: The new output tensor to connect. +void ReplaceProducerOutput(std::vector> producerList, const Tensor *const tensorToReplace, + std::shared_ptr newTensor) +{ + // Not passed by reference. Original can be modified in loop. + for ( const auto &producer : producerList ) + { + Operation *prod = producer.get(); + auto idx = 0; + auto usage = MakeTensorUsage(TensorUsage::OFM, 0); + auto prodOfmConn = prod->Output(usage); + + while ( prodOfmConn != nullptr ) + { + if ( prodOfmConn->tensor.get() == tensorToReplace ) + { + // Do not want to replace the shape. Only the tensor and add writers. + // As ConnectOutput but do not replace shape. 
+ newTensor->AddWriter(prod->shared_from_this()); + if ( prodOfmConn->tensor != newTensor ) + { + prodOfmConn->tensor->RemoveWriter(prod->shared_from_this()); + } + prodOfmConn->tensor = newTensor; + } + usage = MakeTensorUsage(TensorUsage::OFM, ++idx); + prodOfmConn = prod->Output(usage); + } + } +} + + +// Connects input to operations in given list. Will not replace connection shape. +// Parameters: +// - exemptOperation: operation to exempt. +// - consumerList: List of consumers. +// - tensorToReplace: if IFM on consumer match this tensor, replace it. +// - newTensor: The new input tensor to connect. +void ReplaceConsumerInput(const Operation *const exemptOperation, std::vector> consumerList, + const Tensor *const tensorToReplace, std::shared_ptr newTensor) +{ + // Not passed by reference. Original can be modified in loop. + for ( const auto &consumer : consumerList ) + { + Operation *cons = consumer.get(); + auto idx = 0; + auto usage = MakeTensorUsage(TensorUsage::IFM, 0); + auto *consIfmConn = cons->Input(usage); + + while ( consIfmConn != nullptr ) + { + if ( consIfmConn->tensor.get() == tensorToReplace && cons != exemptOperation ) + { + // Do not want to replace the shape. Only the tensor and add writers. + // As ConnectInput but do not replace shape. 
+ newTensor->AddReader(cons->shared_from_this()); + if ( consIfmConn->tensor != newTensor ) + { + consIfmConn->tensor->RemoveReader(cons->shared_from_this()); + } + consIfmConn->tensor = newTensor; + } + usage = MakeTensorUsage(TensorUsage::IFM, ++idx); + consIfmConn = cons->Input(usage); + } + } +} + + +} // namespace regor::GraphOptimisation diff --git a/ethosu/regor/compiler/optimiser_utils.hpp b/ethosu/regor/compiler/optimiser_utils.hpp new file mode 100644 index 00000000..94d3efcf --- /dev/null +++ b/ethosu/regor/compiler/optimiser_utils.hpp @@ -0,0 +1,55 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "quantization.hpp" +#include "tensor.hpp" + +#include + +namespace regor::GraphOptimisation +{ + +// Find specified tensor in Inputs() / Outputs() vectors. +// returns true if found in given vector. +bool IsTensorInVector(const std::vector> &tensorVec, const Tensor *const tensorToFind); + +// Insert a MemoryCopy operation after given ifm tensor. Returns a copy op shared_ptr. +// Will make a clone of ifm as ofm and connects any other consumers of the ifm to it. +std::shared_ptr InsertCopyOpAfterTensor(std::shared_ptr const &ifm, const Quantization &quantization); + + +// Connects output to operations in given list. Will not replace connection shape. +// Parameters: +// - producerList: List of producers. 
+// - tensorToReplace: if OFM on consumer match this tensor, replace it. +// - newTensor: The new output tensor to connect. +void ReplaceProducerOutput(std::vector> producerList, const Tensor *const tensorToReplace, + std::shared_ptr newTensor); + +// Connects input to operations in given list. Will not replace connection shape. +// Parameters: +// - exemptOperation: operation to exempt. +// - consumerList: List of consumers. +// - tensorToReplace: if IFM on consumer match this tensor, replace it. +// - newTensor: The new input tensor to connect. +void ReplaceConsumerInput(const Operation *const exemptOperation, std::vector> consumerList, + const Tensor *const tensorToReplace, std::shared_ptr newTensor); + +} // namespace regor::GraphOptimisation diff --git a/ethosu/regor/compiler/quantization.cpp b/ethosu/regor/compiler/quantization.cpp new file mode 100644 index 00000000..0bc4d4c8 --- /dev/null +++ b/ethosu/regor/compiler/quantization.cpp @@ -0,0 +1,59 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "quantization.hpp" + +#include "common/common.hpp" + +namespace regor +{ + +std::string Quantization::ToString() const +{ + std::vector scale; + for ( const QuantizedScale &s : scales ) + { + scale.push_back(fmt::format("(scale:{}, shift:{})", s.scale, s.shift)); + } + return fmt::format("scale: [{}], zero_point: [{}], quantMin: [{}], quantMax: [{}], dimension: {}, force_zero_point: {}", + fmt::join(scale, ", "), fmt::join(zeroPoints, ", "), fmt::join(quantMin, ", "), fmt::join(quantMax, ", "), dimension, forceZeroPoint); +} + +bool Quantization::operator==(const Quantization &rhs) const +{ + return std::tie(scales, zeroPoints, quantMin, quantMax, dimension, forceZeroPoint) == + std::tie(rhs.scales, rhs.zeroPoints, rhs.quantMin, rhs.quantMax, rhs.dimension, rhs.forceZeroPoint); +} + +bool Quantization::operator!=(const Quantization &rhs) const +{ + return !(*this == rhs); +} + +const Quantization &Quantization::Unit() +{ + static Quantization unitQuantization; + if ( unitQuantization.scales.empty() ) + { + unitQuantization.scales.emplace_back(QuantizedScale{1, 0}); + unitQuantization.zeroPoints.emplace_back(0); + } + return unitQuantization; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/quantization.hpp b/ethosu/regor/compiler/quantization.hpp new file mode 100644 index 00000000..edc14d62 --- /dev/null +++ b/ethosu/regor/compiler/quantization.hpp @@ -0,0 +1,94 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "common/scaling.hpp" + +#include + +namespace regor +{ + +enum class QuantizationType +{ + TFLITE, // TFLite-specific rescale in backend + EXPLICIT, // Explicit scaling +}; + +class Quantization +{ +public: + QuantizationType type = QuantizationType::EXPLICIT; + std::vector scales; + std::vector zeroPoints; + std::vector quantMin; + std::vector quantMax; + int dimension = 0; + bool forceZeroPoint = false; + +public: + Quantization() = default; + Quantization(Quantization &&other) noexcept { *this = std::move(other); } + Quantization(const Quantization &other) { *this = other; } + + static const Quantization &Unit(); + bool operator==(const Quantization &rhs) const; + bool operator!=(const Quantization &rhs) const; + std::string ToString() const; + bool IsValid() const { return !zeroPoints.empty() && !scales.empty(); } + bool EqualScales(const Quantization &other) const + { + return other.scales == scales && other.zeroPoints == zeroPoints; + } + explicit operator bool() const { return IsValid(); } + + Quantization &operator=(const Quantization &other) + { + if ( this != &other ) + { + type = other.type; + scales = other.scales; + zeroPoints = other.zeroPoints; + quantMin = other.quantMin; + quantMax = other.quantMax; + dimension = other.dimension; + forceZeroPoint = other.forceZeroPoint; + } + return *this; + } + + Quantization &operator=(Quantization &&other) noexcept + { + if ( this != &other ) + { + type = other.type; + scales = std::move(other.scales); + zeroPoints = std::move(other.zeroPoints); + quantMin = std::move(other.quantMin); + quantMax = std::move(other.quantMax); + dimension = other.dimension; + forceZeroPoint = other.forceZeroPoint; + } + return *this; + } +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/raw_writer.cpp b/ethosu/regor/compiler/raw_writer.cpp new file 
mode 100644 index 00000000..22a5b216 --- /dev/null +++ b/ethosu/regor/compiler/raw_writer.cpp @@ -0,0 +1,260 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "raw_writer.hpp" + +#include "common/logging.hpp" + +#include +#include +#include + +#include "include/regor.h" + + +namespace regor +{ + +static constexpr int WEIGHTS_REGION = 0; +static constexpr int SCRATCH_REGION = 1; +static constexpr int SCRATCH_FAST_REGION = 2; +static constexpr int INPUT_REGION = SCRATCH_REGION; +static constexpr int OUTPUT_REGION = SCRATCH_REGION; + +std::vector, size_t>> RawWriter::Serialise( + const std::vector> &graphs, const std::vector> &tensor_address_maps) +{ + if ( graphs.size() != 1 ) + { + throw std::invalid_argument("RawWriter expects 1 graph"); + } + const auto &graph = graphs[0]; + + std::vector operations; + graph->GetAllOperations(operations); + if ( operations.size() != 1 ) + { + throw std::invalid_argument("RawWriter expects graph with 1 operation"); + } + + if ( tensor_address_maps.size() != 1 ) + { + throw std::invalid_argument("RawWriter expects 1 tensor address map"); + } + const auto &tensor_address_map = tensor_address_maps[0]; + + const Operation *customNpuOp = operations[0]; + if ( customNpuOp->Type() != OpType::CustomNpuOp ) + { + throw std::invalid_argument("RawWriter expects graph with 1 CustomNpuOp"); + } + + const 
auto &graphInputs = graph->Inputs(); + std::unordered_set> inputsSet(graphInputs.begin(), graphInputs.end()); + + const auto &graphOutputs = graph->Outputs(); + std::unordered_set> outputsSet(graphOutputs.begin(), graphOutputs.end()); + + // ethos_u_command_stream in TFLite format + auto commandStreamTensorConnection = customNpuOp->Input(MakeTensorUsage(TensorUsage::Params, 0)); + auto commandStreamTensor = commandStreamTensorConnection->tensor.get(); + SerialiseCommandStreamTensor(commandStreamTensor); + + // read_only in TFLite format + auto readOnlyTensorConnection = customNpuOp->Input(MakeTensorUsage(TensorUsage::Params, 1)); + auto readOnlyTensor = readOnlyTensorConnection->tensor.get(); + SerialiseReadOnlyTensor(readOnlyTensor); + + // scratch in TFLite format + auto featureMapTensorConnection = customNpuOp->Input(MakeTensorUsage(TensorUsage::State, 0)); + auto featureMapTensor = featureMapTensorConnection->tensor.get(); + SerialiseScratchTensor(featureMapTensor, tensor_address_map.at(featureMapTensor)); + + // scratch/scratch_fast TFLite format + auto stagingTensorConnection = customNpuOp->Input(MakeTensorUsage(TensorUsage::State, 1)); + auto stagingTensor = stagingTensorConnection->tensor.get(); + if ( stagingTensor == featureMapTensor ) + { + SerialiseScratchTensor(stagingTensor, tensor_address_map.at(stagingTensor)); + } + else + { + SerialiseScratchFastTensor(stagingTensor, tensor_address_map.at(stagingTensor)); + } + + for ( const auto &[tensorUsage, tensorConnection] : customNpuOp->Inputs().pairs() ) + { + auto inputTensor = tensorConnection.tensor.get(); + if ( IsIFM(tensorUsage) && !inputTensor->IsConstant() ) + { + // Serialise input tensor + SerialiseInputTensor(inputTensor, tensor_address_map.at(inputTensor)); + } + } + + for ( const auto &[tensorUsage, tensorConnection] : customNpuOp->Outputs().pairs() ) + { + auto outputTensor = tensorConnection.tensor.get(); + if ( IsOFM(tensorUsage) ) + { + // Serialise output tensor + 
SerialiseOutputTensor(outputTensor, tensor_address_map.at(outputTensor)); + } + } + + return std::move(_raw); +} + +void RawWriter::SerialiseCommandStreamTensor(const Tensor *tensor) +{ + assert(tensor->IsConstant()); + + // command_stream buffer and buffer size + auto blob = tensor->View().Buffer()->Data(); + auto blobSize = tensor->View().Buffer()->Size(); + + const size_t serialisedTensorSize = sizeof(regor_raw_tensor_header_t) + blobSize; + auto serialisedTensor = std::make_unique(serialisedTensorSize); + + // Initialise header + regor_raw_tensor_header_t header; + header.type = regor_raw_tensor_header_t::RAW_TENSOR_TYPE_COMMAND_STREAM; + header.tensor.command_stream.size = blobSize; + + // Copy header + std::copy_n(reinterpret_cast(&header), sizeof(header), serialisedTensor.get()); + + // Copy blob to right after header + std::copy_n(blob, blobSize, serialisedTensor.get() + sizeof(regor_raw_tensor_header_t)); + + _raw.emplace_back(std::move(serialisedTensor), serialisedTensorSize); +} + +void RawWriter::SerialiseReadOnlyTensor(const Tensor *tensor) +{ + assert(tensor->IsConstant()); + + // read_only buffer and buffer size + auto blob = tensor->View().Buffer()->Data(); + auto blobSize = tensor->View().Buffer()->Size(); + + const size_t serialisedTensorSize = sizeof(regor_raw_tensor_header_t) + blobSize; + auto serialisedTensor = std::make_unique(serialisedTensorSize); + + // Initialise read_only header + regor_raw_tensor_header_t header; + header.type = regor_raw_tensor_header_t::RAW_TENSOR_TYPE_READ_ONLY; + header.tensor.read_only.region = WEIGHTS_REGION; + header.tensor.read_only.size = blobSize; + + // Copy header + std::copy_n(reinterpret_cast(&header), sizeof(header), serialisedTensor.get()); + + // Copy blob to right after header + std::copy_n(blob, blobSize, serialisedTensor.get() + sizeof(regor_raw_tensor_header_t)); + + _raw.emplace_back(std::move(serialisedTensor), serialisedTensorSize); +} + +void RawWriter::SerialiseScratchTensor(const Tensor 
*tensor, Address address) +{ + assert(!tensor->IsConstant()); + + const size_t serialisedTensorSize = sizeof(regor_raw_tensor_header_t); + auto serialisedTensor = std::make_unique(serialisedTensorSize); + + // Initialise scratch header + regor_raw_tensor_header_t header; + header.type = regor_raw_tensor_header_t::RAW_TENSOR_TYPE_SCRATCH; + header.tensor.scratch.region = SCRATCH_REGION; + header.tensor.scratch.size = DataTypeStorageSizeBytes(tensor->Type(), tensor->StorageShape().Elements()); + header.tensor.scratch.address = address; + + // Copy header + std::copy_n(reinterpret_cast(&header), sizeof(header), serialisedTensor.get()); + + _raw.emplace_back(std::move(serialisedTensor), serialisedTensorSize); +} + +void RawWriter::SerialiseScratchFastTensor(const Tensor *tensor, Address address) +{ + assert(!tensor->IsConstant()); + + const size_t serialisedTensorSize = sizeof(regor_raw_tensor_header_t); + auto serialisedTensor = std::make_unique(serialisedTensorSize); + + // Initialise scratch_fast tensor header + regor_raw_tensor_header_t header; + header.type = regor_raw_tensor_header_t::RAW_TENSOR_TYPE_SCRATCH_FAST; + header.tensor.scratch_fast.region = SCRATCH_FAST_REGION; + header.tensor.scratch_fast.size = DataTypeStorageSizeBytes(tensor->Type(), tensor->StorageShape().Elements()); + header.tensor.scratch_fast.address = address; + + // Copy header + std::copy_n(reinterpret_cast(&header), sizeof(header), serialisedTensor.get()); + + _raw.emplace_back(std::move(serialisedTensor), serialisedTensorSize); +} + +void RawWriter::SerialiseInputTensor(const Tensor *tensor, Address address) +{ + assert(!tensor->IsConstant()); + + const size_t serialisedTensorSize = sizeof(regor_raw_tensor_header_t); + auto serialisedTensor = std::make_unique(serialisedTensorSize); + + // Initialise input tensor header + regor_raw_tensor_header_t header; + header.type = regor_raw_tensor_header_t::RAW_TENSOR_TYPE_INPUT; + header.tensor.input.region = INPUT_REGION; + 
header.tensor.input.element_size = DataTypeStorageSizeBytes(tensor->Type(), 1); + auto shape = Shape::PadAxes(tensor->StorageShape(), 4, 1).ToList(); + std::copy(shape.begin(), shape.end(), header.tensor.input.shape); + header.tensor.input.size = DataTypeStorageSizeBytes(tensor->Type(), tensor->StorageShape().Elements()); + header.tensor.input.address = address; + + // Copy header + std::copy_n(reinterpret_cast(&header), sizeof(header), serialisedTensor.get()); + + _raw.emplace_back(std::move(serialisedTensor), serialisedTensorSize); +} + +void RawWriter::SerialiseOutputTensor(const Tensor *tensor, Address address) +{ + assert(!tensor->IsConstant()); + + const size_t serialisedTensorSize = sizeof(regor_raw_tensor_header_t); + auto serialisedTensor = std::make_unique(serialisedTensorSize); + + // Initialise output tensor header + regor_raw_tensor_header_t header; + header.type = regor_raw_tensor_header_t::RAW_TENSOR_TYPE_OUTPUT; + header.tensor.output.region = OUTPUT_REGION; + header.tensor.output.element_size = DataTypeStorageSizeBytes(tensor->Type(), 1); + auto shape = Shape::PadAxes(tensor->StorageShape(), 4, 1).ToList(); + std::copy(shape.begin(), shape.end(), header.tensor.input.shape); + header.tensor.output.size = DataTypeStorageSizeBytes(tensor->Type(), tensor->StorageShape().Elements()); + header.tensor.output.address = address; + + // Copy header + std::copy_n(reinterpret_cast(&header), sizeof(header), serialisedTensor.get()); + + _raw.emplace_back(std::move(serialisedTensor), serialisedTensorSize); +} + +} // namespace regor diff --git a/ethosu/regor/compiler/raw_writer.hpp b/ethosu/regor/compiler/raw_writer.hpp new file mode 100644 index 00000000..9630f104 --- /dev/null +++ b/ethosu/regor/compiler/raw_writer.hpp @@ -0,0 +1,53 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file 
except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "compiler/graph.hpp" +#include "compiler/tensor.hpp" + +#include +#include +#include + +namespace regor +{ + +class RawWriter +{ +public: + std::vector, size_t>> Serialise(const std::vector> &graphs, + const std::vector> &tensor_address_maps); + +private: + std::vector, size_t>> _raw; + + void SerialiseCommandStreamTensor(const Tensor *tensor); + + void SerialiseReadOnlyTensor(const Tensor *tensor); + + void SerialiseScratchTensor(const Tensor *tensor, Address address); + + void SerialiseScratchFastTensor(const Tensor *tensor, Address address); + + void SerialiseInputTensor(const Tensor *tensor, Address address); + + void SerialiseOutputTensor(const Tensor *tensor, Address address); +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/scheduler.cpp b/ethosu/regor/compiler/scheduler.cpp new file mode 100644 index 00000000..31571307 --- /dev/null +++ b/ethosu/regor/compiler/scheduler.cpp @@ -0,0 +1,1742 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "scheduler.hpp" + +#include "common/logging.hpp" + +#include "architecture/weight_encoder.hpp" +#include "cascade_builder.hpp" +#include "common/scaling.hpp" +#include "common/vector_span.hpp" +#include "faststorage_allocator.hpp" +#include "live_range.hpp" +#include "tensor_allocator.hpp" + +#include +#include +#include + +namespace regor +{ + +constexpr int AllocationQuantum = 16; +constexpr int AlignmentQuantum = 16; + +static Shape GetShapeForFormat(const Shape &shape, TensorFormat format) +{ + if ( format == TensorFormat::NHCWB16 ) + { + return shape.With(-1, RoundAway(shape.Depth(), 16)); + } + return shape; +} + +int TensorAllocationBytes(const Shape &shape, TensorFormat format, DataType dtype) +{ + Shape storageShape = GetShapeForFormat(shape, format); + return RoundAway(DataTypeStorageSizeBytes(dtype, storageShape.Elements()), AllocationQuantum); +} + +Scheduler::Scheduler(Architecture *arch, const SchedulerOptions &options, const std::string &name, + std::vector> &ops) : + _ops(ops) +{ + assert(arch != nullptr); + _arch = arch; + _options = options; + _name = name; + _spilling = _arch->StagingMemory() != _arch->FeatureMapMemory(); +} + +std::shared_ptr Scheduler::Process() +{ + Address peakMemoryUsage = CreateSchedulerRepresentation(); + + // Create the Max schedule template + _maxSchedule = CreateInitialSchedule(); + + // TODO: Disabled until fully implemented + // MoveConstantData( _maxSchedule.get() ); + + // Create the optimised Max schedule + UpdateOpMemorySnapshot(_maxSchedule.get()); + auto optMaxSchedule = ProposeScheduleBuffering(_maxSchedule.get(), std::numeric_limits::max()); + UpdateOpMemorySnapshot(optMaxSchedule.get()); + + // Create Min schedule + auto minSchedule = ProposeMinimalSchedule(); + Address initialStagingLimit = _options.optimizationStagingLimit; + if ( _options.optimizationStrategy == 
OptimizationStrategy::Size ) + { + initialStagingLimit = peakMemoryUsage; + } + + // Build cascades from min schedule + std::unordered_map nonLocal; + CascadeBuilder cascadeBuilder(_ops, nonLocal, _spilling); + cascadeBuilder.BuildCascades(minSchedule.get(), _maxSchedule.get(), initialStagingLimit); + UpdateOpMemorySnapshot(minSchedule.get()); + + std::shared_ptr chosenSchedule = minSchedule; + + if ( _options.optimizationStrategy == OptimizationStrategy::Performance ) + { + // Create an optimized schedule + auto optSchedule = OptimizeSchedule(minSchedule.get(), optMaxSchedule); + UpdateOpMemorySnapshot(optSchedule.get()); + chosenSchedule = std::move(optSchedule); + } + + CoalesceWeightBufferTensors(chosenSchedule.get()); + UpdateOpMemorySnapshot(chosenSchedule.get()); + + ApplySchedule(chosenSchedule.get()); + + if ( _spilling ) + { + // Use fast storage for feature maps + FastStorageAllocator allocator; + allocator.AllocateFeatureMaps(_ops, chosenSchedule.get(), _arch->StagingMemory(), _options.optimizationStagingLimit); + } + + UpdateOpMemorySnapshot(chosenSchedule.get()); + + if ( _options.verboseSchedule ) + { + PrintSchedule(chosenSchedule.get()); + } + + AllocateAddresses(chosenSchedule.get()); + + return chosenSchedule; +} + +Point2i Scheduler::GetStripeInputRequirement(const Shape &ofmShape, Kernel *kernel, ArchResampling resampling) +{ + int rounding; + int upscale = _arch->UpscaleAndRounding(resampling, rounding); + int h = RequiredInputSize(ofmShape.Height(), kernel->Stride().y, kernel->DilatedWH().y, upscale, rounding); + int w = RequiredInputSize(ofmShape.Width(), kernel->Stride().x, kernel->DilatedWH().x, upscale, rounding); + return Point2i(w, h); +} + +// Returns true if NHWC format must be used for the given tensor +static bool CheckLinearFormatForConcatSplit(SchedulerTensor *tensor) +{ + for ( const auto &prod : tensor->producers ) + { + // If axis corresponds to C-dimension, NHCWB16 can only be used in the output if all the concat_start's + // 
are a multiple of 16. This as, it is only then the address offset for the ofm, for all operations, + // will be 16 byte aligned. For other values of axis the address offsets will be 16 byte aligned, as they + // are all based on c = 0 and those addresses are always 16 byte aligned due to the NHCWB16 format. + for ( auto &conn : prod->outputs ) + { + if ( conn.tensor.get() == tensor && conn.slice.offset.Size() > 0 && (conn.slice.offset.Depth() & 15) != 0 ) + { + return true; + } + } + } + for ( const auto &cons : tensor->consumers ) + { + // If read offset is not a multiple of 16 in the C-dimension, NHCWB16 need to be avoided in the input. + for ( auto &conn : cons->inputs ) + { + if ( conn.tensor.get() == tensor && conn.slice.offset.Size() > 0 && (conn.slice.offset.Depth() & 15) != 0 ) + { + return true; + } + } + } + return false; +} + + +static int UpdateSchedulerTensor(TensorUsage usage, SchedulerConnection *conn) +{ + auto tensor = conn->tensor.get(); + + // Multiple consumers/producers require the full tensor present + if ( tensor->producers.size() > 1 || tensor->consumers.size() > 1 || (IsOFM(usage) && conn->slice.offset.Size() > 0) || // Concat + tensor->isGraphInput || tensor->isGraphOutput ) + { + conn->requireFullTensor = true; + } + + // Force linear output from Reverse for C dimension because brick output from Reverse has special requirements + if ( IsOFM(usage) && conn->reverse == ReverseType::C ) + { + tensor->needsLinearFormat = true; + } + + for ( auto producer : tensor->producers ) + { + // TODO: Gather doesn't support brick format yet (MLBEDSW-8410) + if ( producer->Type() == OpType::Scatter || producer->Type() == OpType::Gather ) + { + tensor->needsLinearFormat = true; + } + if ( !producer->IsNpuOp() ) + { + tensor->hasCPUWriters = true; + } + } + + for ( auto consumer : tensor->consumers ) + { + if ( !consumer->IsNpuOp() ) + { + tensor->hasCPUReaders = true; + } + // TODO: Gather doesn't support brick format yet (MLBEDSW-8410) + if ( 
consumer->Type() == OpType::Scatter || consumer->Type() == OpType::Gather ) + { + tensor->needsLinearFormat = true; + } + // Int32 ReduceSum requires linear format + if ( consumer->Type() == OpType::ReduceSum && tensor->dataType == DataType::Int32 ) + { + tensor->needsLinearFormat = true; + } + // Check if consumer shape requires linear format + // Brick format can only be used if both shapes have equal W and C + auto ifm0 = consumer->TryIFM(0); + auto ifm1 = consumer->TryIFM(1); + auto ifm2 = consumer->TryIFM(2); + if ( (ifm0 && ifm0->tensor.get() == tensor && + Shape::PadAxes(ifm0->shape, 2, 1).WC() != Shape::PadAxes(conn->shape, 2, 1).WC()) || + (ifm1 && ifm1->tensor.get() == tensor && + Shape::PadAxes(ifm1->shape, 2, 1).WC() != Shape::PadAxes(conn->shape, 2, 1).WC()) || + (ifm2 && ifm2->tensor.get() == tensor && + Shape::PadAxes(ifm2->shape, 2, 1).WC() != Shape::PadAxes(conn->shape, 2, 1).WC()) ) + { + tensor->needsLinearFormat = true; + } + } + + // Initial criteria (may change) + bool cpuTensor = tensor->hasCPUWriters || tensor->hasCPUReaders || tensor->isGraphInput || tensor->isGraphOutput; + conn->requireFullTensor = conn->requireFullTensor || cpuTensor; + tensor->needsLinearFormat = tensor->needsLinearFormat || cpuTensor || CheckLinearFormatForConcatSplit(tensor); + + // Set tensor format to NHCWB16 for output FeatureMaps, if possible + if ( IsOFM(usage) ) + { + if ( !tensor->needsLinearFormat ) + { + tensor->format = TensorFormat::NHCWB16; + } + } + + return tensor->srcTensor->IsConstant() ? 
0 : tensor->AllocationSizeBytes(); +} + + +Address Scheduler::CreateSchedulerRepresentation() +{ + int minMemoryRequired = 0; + + for ( auto const &schedOp : _ops ) + { + int opMemoryRequired = 0; + + for ( auto pos : schedOp->outputs.pairs() ) + { + opMemoryRequired += UpdateSchedulerTensor(pos.first, &pos.second); + } + + for ( auto pos : schedOp->inputs.pairs() ) + { + opMemoryRequired += UpdateSchedulerTensor(pos.first, &pos.second); + } + + minMemoryRequired = std::max(minMemoryRequired, opMemoryRequired); + } + + return minMemoryRequired; +} + + +namespace +{ + + +ArchAccumulatorSource GetArchAccumulatorSource(const AccumulatorControl &ac) +{ + switch ( ac.source ) + { + case AccumulatorSource::Reset: + return ArchAccumulatorSource::Reset; + case AccumulatorSource::Acc: + return ArchAccumulatorSource::Acc; + case AccumulatorSource::Ifm2: + return ArchAccumulatorSource::Ifm2; + default: + return ArchAccumulatorSource::Reset; + } +} + +std::unique_ptr GetOpConfig(Architecture *arch, SchedulerOperation *op, const Shape &ifmShape, + const Shape &ifm2Shape, const Shape &ofmShape, WeightFormat wgtFormat) +{ + assert(op->IsNpuOp()); + + SchedulerConnection *ifm = op->IFM(0); + SchedulerConnection *ifm2 = op->TryIFM(1); + SchedulerConnection *ofm = op->OFM(); + + ArchitectureConfigQuery query; + query.ofmShape = Shape::PadAxes(ofmShape, 3, 1); + query.ifmShape[0] = ifmShape; + query.ifmShape[1] = ifm2Shape; + query.ifmBits = DataTypeSizeBits(ifm->tensor->dataType); + query.kernel = op->Kernel(); + query.lutBytes = op->TryInput(TensorUsage::LUT) ? 
2048 : 0; + query.scaled = op->HasScaling(); + query.ifmResampling = ifm->resamplingMode; + query.ofmShape = query.ofmShape.Untranspose(ofm->transpose); + query.transpose = ofm->transpose; + query.reverse = ofm->reverse; + query.ofmFormat = ofm->tensor->format; + const auto &accMode = op->AccumulatorMode(); + query.accSource = GetArchAccumulatorSource(accMode); + query.accOutputEnabled = accMode.outputEnabled; + query.weightFormat = wgtFormat; + if ( op->Type() == OpType::Resize ) + { + query.rescaling.scaleX = op->Attributes().resize.scaleX; + query.rescaling.scaleY = op->Attributes().resize.scaleY; + } + + return arch->GetOpConfig(op->Type(), query); +} + +struct EncodingResult +{ + Flags format; + int size = std::numeric_limits::max(); + int optimalDepth = 0; +}; + +Flags ChooseBestWeightFormat(Architecture *arch, SchedulerOperation *, OptimizationStrategy strategy, + int maxStreams, std::vector &encodingResults) +{ + EncodingResult *bestResult = nullptr; + // Prefer fast decoder if weights are in fast memory. 
+ bool preferFast = (arch->ReadonlyMemory().memory->Bandwidth() == arch->StagingMemory().memory->Bandwidth()) && (strategy != OptimizationStrategy::Size); + // Standard decoder is faster if we have 4 instances + bool avoidFast = maxStreams > 2; + // If buffering is improved, we may want to allow larger diff here + constexpr float maxFastSizeRatio{1.01f}; + constexpr float maxSparseSizeRatio{1.01f}; + for ( auto &encodingResult : encodingResults ) + { + bool isFast = (encodingResult.format & WeightFormat::Fast); + if ( !bestResult ) + { + bestResult = &encodingResult; + continue; + } + if ( (bestResult->format & WeightFormat::Fast) != isFast ) + { + if ( isFast && !avoidFast && (preferFast || encodingResult.size <= bestResult->size * maxFastSizeRatio) ) + { + bestResult = &encodingResult; + continue; + } + if ( !isFast && (avoidFast || (!preferFast && encodingResult.size <= bestResult->size * maxFastSizeRatio)) ) + { + bestResult = &encodingResult; + continue; + } + } + if ( !(bestResult->format & WeightFormat::Sparse2_4) && (encodingResult.format & WeightFormat::Sparse2_4) ) + { + if ( (encodingResult.optimalDepth <= bestResult->optimalDepth) && // Sparsity can give different block + // config + (encodingResult.size <= bestResult->size * maxSparseSizeRatio) ) + { + bestResult = &encodingResult; + } + } + } + return bestResult->format; +} + +} // namespace + +Flags Scheduler::BestWeightFormat( + SchedulerOperation *op, Shape &ifmShape, Shape &ifm2Shape, Shape &ofmShape, Flags supportedFormats) +{ + using WF = Flags; + + std::vector encodingResults; + auto weights = op->Input(TensorUsage::Weights); + auto ifm = op->IFM(op->PrimaryIfmIndex()); + auto ifmType = ifm->tensor->dataType; + int maxStreams = 0; + // WF(WeightFormat::Default) needs to be first, FWD doesn't count zeroes. 
+ for ( auto weightFormat : {WF(WeightFormat::Default), WF(WeightFormat::Fast), + WF(WeightFormat::Default, WeightFormat::Sparse2_4), WF(WeightFormat::Fast, WeightFormat::Sparse2_4)} ) + { + if ( (weightFormat & supportedFormats) != weightFormat ) continue; + EncodingResult result{weightFormat}; + std::unique_ptr blockConfig = GetOpConfig(_arch, op, ifmShape, ifm2Shape, ofmShape, weightFormat); + result.optimalDepth = blockConfig->OptimalDepthGranule(); + WeightsRef weightsRef = {&weights->tensor->bufferView, weights->tensor->srcTensor->AxisOrder(), weights->tensor->dataType}; + + std::vector depthOffsets{0, ofmShape.Depth()}; + auto encodingParams = _arch->WeightEncoder()->GetEncodingConfig( + blockConfig.get(), weightsRef, op->Kernel(), ifmType, depthOffsets, weightFormat); + try + { + auto weightInfo = AnalyzeWeights(encodingParams.get(), weights->tensor.get(), weights->quantization); + + result.size = weightInfo.encodedSize; + if ( weightInfo.streams > maxStreams ) maxStreams = weightInfo.streams; + if ( weightFormat == WF(WeightFormat::Default) && weightInfo.zeroCount < DivRoundUp(weightInfo.sourceSize, 2) ) + { + // Can't be Sparse2_4 if less than half of the weights are zero + supportedFormats &= ~WF(WeightFormat::Sparse2_4); + } + } + catch ( const WeightEncodeException & ) + { + continue; + } + encodingResults.emplace_back(result); + } + assert(!encodingResults.empty()); + return ChooseBestWeightFormat(_arch, op, _options.optimizationStrategy, maxStreams, encodingResults); +} + +std::unique_ptr Scheduler::CreateSchedulerOpInfo(SchedulerOperation *op, const Shape &ofmStripeShape) +{ + assert(op->PrimaryIfmIndex() >= 0 && op->PrimaryIfmIndex() <= 1); + SchedulerConnection *ifm = op->IFM(op->PrimaryIfmIndex()); + SchedulerConnection *ifm2 = op->TryIFM(1 - op->PrimaryIfmIndex()); + + auto ifmShape = ifm->shape; + auto ifm2Shape = ifm2 ? 
ifm2->shape : Shape(); + auto ofmShape = ofmStripeShape; + + // Operations that cannot be subdivided require full OFM shape + if ( _arch->CanSubdivide(op->Type()) == AxisMask::None ) + { + ofmShape = op->OFM()->shape; + } + + // Give empty operation info to CPU ops + if ( !op->IsNpuOp() ) + { + return std::make_unique(nullptr, ifmShape, ifm2Shape, ofmShape); + } + + // Determine if striped operation + if ( ofmShape != op->OFM()->shape ) + { + // Striped Op - Need to calculate stripe input volume + Point2i stripeInput = GetStripeInputRequirement(ofmShape, op->Kernel(), ifm->resamplingMode); + + // Ensure stripe input volume is within the full IFM volume + stripeInput = Point2i::Min(stripeInput, ifmShape.WH()); + ifmShape = ifmShape.WithHW(stripeInput.y, stripeInput.x); + + if ( !ifm2Shape.IsEmpty() ) + { + Point2i stripeInput2 = Point2i::Min(stripeInput, ifm2Shape.WH()); + ifm2Shape = ifm2Shape.WithHW(stripeInput2.y, stripeInput2.x); + } + } + + auto weightFormat = _arch->SupportedWeightFormat(op->Type()); + + WeightScaleTensors weightScales; + auto weights = op->TryInput(TensorUsage::Weights); + if ( !weights || !weights->tensor->IsConstant() ) weightFormat = WeightFormat::Default; + if ( weightFormat != WeightFormat::Default ) + weightFormat = BestWeightFormat(op, ifmShape, ifm2Shape, ofmShape, weightFormat); + // Potentially repeat until weight encoding successful + std::unique_ptr blockConfig; + do + { + blockConfig = GetOpConfig(_arch, op, ifmShape, ifm2Shape, ofmShape, weightFormat); + + if ( weights == nullptr || !weights->tensor->IsConstant() ) break; + + auto scales = op->Input(TensorUsage::Scales); + + WeightsRef weightsRef = {&weights->tensor->bufferView, weights->tensor->srcTensor->AxisOrder(), weights->tensor->dataType}; + + std::vector depthOffsets{0, ofmShape.Depth()}; + + auto encodingParams = _arch->WeightEncoder()->GetEncodingConfig( + blockConfig.get(), weightsRef, op->Kernel(), ifm->tensor->dataType, depthOffsets, weightFormat); + + try + { + if 
( op->OFM()->quantization.type != QuantizationType::EXPLICIT ) + { + auto temp = _arch->WeightEncoder()->MakeExplicit(ifm->quantization, weights->quantization, + op->OFM()->quantization, scales->tensor->dataType, ifm->tensor->dataType); + op->OFM()->quantization = std::move(temp); + assert(op->OFM()->quantization.type == QuantizationType::EXPLICIT); + } + + weightScales = EncodeWeightAndScaleTensor(std::move(encodingParams), weights->tensor.get(), + scales->tensor.get(), weights->quantization, op->OFM()->quantization); + break; + } + catch ( const WeightEncodeException & ) + { + if ( weightFormat & WeightFormat::Sparse2_4 ) weightFormat ^= WeightFormat::Sparse2_4; + else if ( weightFormat & WeightFormat::Fast ) weightFormat ^= WeightFormat::Fast; + else break; + } + } while ( true ); + + // Finally construct and populate operator information (cost) + auto opInfo = std::make_unique(std::move(blockConfig), ifmShape, ifm2Shape, ofmShape); + opInfo->SetWeightScaleTensors(weightScales.npuWeightsTensor, weightScales.npuScalesTensor); + + return opInfo; +} + + +std::unique_ptr Scheduler::CreateInitialSchedule() +{ + auto schedule = std::make_unique(_name + "_MAX"); + + for ( auto &op : _ops ) + { + auto cost = CreateSchedulerOpInfo(op.get(), op->OFM()->SliceShape()); + cost->cycles = EstimateOpPerformance(op.get(), cost->Config(), op->OFM()->shape.Depth()); + cost->elementAccess = EstimateOpElementAccess(op.get(), cost->Config(), op->OFM()->shape.Depth()); + schedule->SetCost(*op, std::move(cost)); + } + return schedule; +} + + +void Scheduler::MoveConstantData(Schedule *refSchedule) +{ + auto permanentStorageMemory = _arch->ReadonlyMemory(); + const bool moveConstantData = permanentStorageMemory != _arch->FeatureMapMemory(); + + // Determine if data can be moved from permanent storage to another memory area. A difference in source tensor + // and target tensor memory area will generate a DMA command in the command stream. 
+ for ( auto &schedOp : _ops ) + { + // Ignore CPU ops + if ( !schedOp->IsNpuOp() ) + { + continue; + } + + auto cost = refSchedule->Cost(schedOp.get()); + int maxIfmShramAvail = cost->Config()->MaxIFMBuffering() / 2; + for ( auto pos : schedOp->inputs.pairs() ) + { + SchedulerConnection *conn = &pos.second; + if ( !conn->tensor->srcTensor->IsConstant() ) + { + continue; + } + + // Determine whether or not to move data from permanent storage to more suitable + // storage before use. + bool moveData = false; + if ( conn->tensor->memArea == permanentStorageMemory && moveConstantData ) + { + moveData = std::any_of(conn->tensor->consumers.begin(), conn->tensor->consumers.end(), + [](const SchedulerOperation *op) { return op->Type() != OpType::FullyConnected; }); + + // Check if broadcast elementwise can be buffered + if ( IsIFM(pos.first) && IsElementwise(schedOp->Type()) && (conn->shape != schedOp->OFM()->shape) && + conn->tensor->srcTensor->View().BufferSize() > maxIfmShramAvail ) + { + moveData = true; + } + } + + if ( moveData ) + { + // Set scheduler tensor to different memory area i.e. 
move from srcTensor to (scheduler) tensor + conn->tensor->memArea = _arch->StagingMemory(); + } + } + } +} + + +void Scheduler::AllocateAddresses(Schedule *schedule) +{ + const auto verbose = _options.verboseAllocation; + AllocateTensors(_ops, schedule, _arch->FeatureMapMemory(), TensorAllocator::HillClimb, AlignmentQuantum, verbose); + if ( _spilling ) + { + const auto limit = _options.optimizationStagingLimit; + AllocateTensors(_ops, schedule, _arch->StagingMemory(), TensorAllocator::HillClimb, AlignmentQuantum, verbose, limit); + } +} + + +/// @brief Specialised LiveRangeGraph for read only (flash) memory which ignores scalars if possible +class ReadOnlyLiveRangeGraph : public LiveRangeGraph +{ +private: + Architecture *_arch; + +public: + ReadOnlyLiveRangeGraph(Architecture *arch) : _arch(arch) {} + bool ShouldBeIgnored(SchedulerTensor *tens, const MemArea &targetMemory) override + { + // First do the regular check for matching memory type + if ( LiveRangeGraph::ShouldBeIgnored(tens, targetMemory) ) + { + return true; + } + // Memory type correct, check if this tensor is a scalar that can be encoded + // in the command stream for this architecture + auto srcTens = tens->srcTensor; + if ( srcTens && srcTens->StorageShape().Elements() == 1 && srcTens->IsConstant() ) + { + // All consumers must accept this scalar if we are to ignore it + for ( auto op : tens->consumers ) + { + // Find usage of the tensor for this consumer op + TensorUsage usage(TensorUsage::None); + for ( auto input : op->inputs.pairs() ) + { + if ( input.second.tensor.get() == tens ) + { + usage = input.first; + } + } + if ( !_arch->SupportsScalar(op->Type(), tens->dataType, usage) ) + { // This scalar cannot be ignored and must be handled + return false; + } + } + // At this point we have determined that the tensor can be encoded + // as a scalar in the command stream and can safely be ignored + return true; + } + // Not a scalar - cannot be ignored + return false; + } +}; + + +void 
Scheduler::AllocateReadOnlyAddresses(Schedule *schedule, IncrementalLinearAllocator &readOnlyAllocator) +{ + auto lrGraph = ReadOnlyLiveRangeGraph(_arch); + lrGraph.ExtractLiveRangesFromCascades(_ops, schedule, _arch->ReadonlyMemory(), false); + auto totalSize = readOnlyAllocator.Allocate(&lrGraph, AlignmentQuantum, _options.verboseAllocation); + schedule->memoryUsage[_arch->ReadonlyMemory()] = int(totalSize); +} + + +void Scheduler::UpdateOpMemorySnapshot(Schedule *schedule) +{ + const auto fastStorage = _arch->StagingMemory(); + auto lrGraph = LiveRangeGraph(); + lrGraph.ExtractLiveRangesFromCascades(_ops, schedule, fastStorage, true); + // Populate time-array with memory used by live ranges + std::vector temporalUsage = lrGraph.GetTemporalMemoryUsage(schedule->fastStoragePeakUsage); + schedule->memorySnapshot = std::move(temporalUsage); +} + + +std::shared_ptr Scheduler::ProposeScheduleBuffering(Schedule *refSchedule, Address stagingLimitBytes) +{ + auto bufferedSchedule = std::make_shared(refSchedule->Name() + "_BUFFERED"); + int stagingLimitClamped = int(std::min(INT64_C(1) << 30, stagingLimitBytes)); + + SchedulerOperation *prevOp = nullptr; + for ( auto const &schedOp : _ops ) + { + SchedulerOpInfo *cost = refSchedule->Cost(schedOp.get()); + // schedOp is not part of this sub-schedule - skip + if ( cost == nullptr ) + { + continue; + } + + ProposeOperatorBuffering(schedOp.get(), prevOp, bufferedSchedule.get(), refSchedule, stagingLimitClamped); + prevOp = schedOp.get(); + } + + return bufferedSchedule; +} + + +void Scheduler::ProposeOperatorBuffering(SchedulerOperation *schedOp, SchedulerOperation *prevOp, + Schedule *bufferedSchedule, Schedule *refSchedule, int stagingLimitBytes) +{ + // Mild recursion might mean this Op has already been seen + if ( bufferedSchedule->Cost(schedOp) != nullptr ) + { + return; + } + + // Take the reference schedule as default costings for this schedule + auto refCost = refSchedule->Cost(schedOp); + assert(refCost != nullptr); 
+ auto costCopy = std::make_unique(*refCost); + auto cost = costCopy.get(); + bufferedSchedule->SetCost(*schedOp, std::move(costCopy)); + + // Don't buffer non NPU operations + if ( !schedOp->IsNpuOp() ) + { + return; + } + + int slackBufferingMemory = stagingLimitBytes - refSchedule->MemoryUsageAt(refCost->timeIndex); + cost->slackBufferingMemory = slackBufferingMemory; + cost->slackBufferingCycles = refCost->cycles.opCycles; + + // Attempt weight buffering on anything with a weights tensor + auto weights = schedOp->TryInput(TensorUsage::Weights); + if ( weights != nullptr ) + { + auto scales = schedOp->Input(TensorUsage::Scales); + ProposeWeightBuffering(weights, scales, schedOp, prevOp, bufferedSchedule, refSchedule, slackBufferingMemory); + } +} + + +void Scheduler::ProposeWeightBuffering(SchedulerConnection *weights, SchedulerConnection *scales, SchedulerOperation *schedOp, + SchedulerOperation *prevOp, Schedule *bufferedSchedule, Schedule *refSchedule, int bufferLimitBytes) +{ + constexpr int OFMSplitDepth = 16; + auto cost = bufferedSchedule->Cost(schedOp); + auto prevCost = bufferedSchedule->Cost(prevOp); + auto refCost = refSchedule->Cost(schedOp); + auto ifm = schedOp->IFM(0); + auto ofm = schedOp->OFM(); + + assert(cost && refCost); + + // Weights are in permanent storage. 
When permanent storage differs from feature map storage, + // there is a point moving the data + auto weightTens = weights->tensor.get(); + auto scaleTens = scales->tensor.get(); + // No need to move the weights if they are already in the same memory as the staging area + bool needsDMA = weightTens->memArea.memory != _arch->StagingMemory().memory; + + std::vector ofmFullDepthSlices = {0, refCost->stripe.Depth()}; + + WeightsRef weightsRef = {&weightTens->bufferView, weightTens->srcTensor->AxisOrder(), weightTens->dataType}; + + auto weightFormat = cost->npuWeightsTensor->config->Format(); + + auto encodingParams = _arch->WeightEncoder()->GetEncodingConfig( + cost->Config(), weightsRef, schedOp->Kernel(), ifm->tensor->dataType, ofmFullDepthSlices, weightFormat); + + auto fullWeightScales = EncodeWeightAndScaleTensor( + std::move(encodingParams), weightTens, scaleTens, weights->quantization, ofm->quantization); + + int fullWeightsBytes = fullWeightScales.npuWeightsTensor->AllocationSizeBytes(); + + // No buffering required - take all the weights from permanent storage + if ( schedOp->Type() == OpType::FullyConnected || !needsDMA || + _arch->CanSubdivide(schedOp->Type()) == AxisMask::None || schedOp->OFM()->reverse == ReverseType::C ) + { + cost->ofmDepthSlices = std::move(ofmFullDepthSlices); + cost->SetWeightScaleTensors(fullWeightScales.npuWeightsTensor, fullWeightScales.npuScalesTensor); + return; + } + + auto encodedWeightScales = fullWeightScales; + + // How many NPU cycles are available under the previously executing + // operator for performing buffered DMA transfers + int64_t slackCycles = (prevCost != nullptr) ? prevCost->slackBufferingCycles : 0; + int slackMemory = (prevCost != nullptr) ? 
prevCost->slackBufferingMemory : 0; + + int weightBufferSize = 0; + + // Force full depth for cascaded Ops + if ( cost->cascade != 0 ) + { + weightBufferSize = fullWeightsBytes; + // Update the memory snapshot to reflect the added size of the weights + refSchedule->memorySnapshot[cost->timeIndex] += weightBufferSize; + } + else + { + // Estimate the buffering cycle time for the full set of weights + int64_t fullTransferCycles = _arch->Performance()->MemToMemCycles( + _arch->StagingMemory().memory, weightTens->memArea.memory, fullWeightsBytes); + cost->fullWeightTransferCycles = fullTransferCycles; + + // Calculate the amount of pre-buffering necessary (or what is possible with limited + // double buffer buffer size) + double prebufferRatio = 0; + const int halfBufferLimit = bufferLimitBytes / 2; + int prebufferBytes = std::min(fullWeightsBytes, halfBufferLimit); + if ( fullTransferCycles > slackCycles ) + { + prebufferRatio = double(slackCycles) / double(fullTransferCycles); + prebufferBytes = std::min(int(prebufferRatio * fullWeightsBytes), halfBufferLimit); + } + + prebufferRatio = double(prebufferBytes) / fullWeightsBytes; + + // Have to split the weights if the initial buffering can't store + // all of the compressed weights + if ( prebufferBytes < fullWeightsBytes ) + { + int blockDepth = cost->Config()->OptimalDepthGranule(); + + // Choose initial pre-buffering depth (already buffer clamped) + int prebufferDepth = int(refCost->stripe.Depth() * prebufferRatio); + prebufferDepth = int(std::max(16, RoundZero(prebufferDepth, OFMSplitDepth))); + + // Calculate cycles executed during the pre-buffer + auto preOpCycles = EstimateOpPerformance(schedOp, cost->Config(), prebufferDepth); + int bufferingDepth = int((refCost->stripe.Depth() * preOpCycles.opCycles) / fullTransferCycles); + + // Choose initial buffering depth and clamp to the double buffering limit + bufferingDepth = RoundAway(bufferingDepth, blockDepth); + int bufferingBytes = (bufferingDepth / 
refCost->stripe.Depth()) * fullWeightsBytes; + if ( bufferingBytes > halfBufferLimit ) + { + bufferingDepth = (halfBufferLimit / fullWeightsBytes) * refCost->stripe.Depth(); + } + + while ( true ) + { + // Attempt to buffer whole blocks + if ( bufferingDepth > blockDepth ) + { + bufferingDepth = RoundZero(bufferingDepth, blockDepth); + } + else + { + bufferingDepth = RoundZero(bufferingDepth, OFMSplitDepth); + } + + bufferingDepth = int(std::max(bufferingDepth, OFMSplitDepth)); + + // Create list of depth slices + std::vector depthSlices = {0}; + + for ( int depth = prebufferDepth; depth < refCost->stripe.Depth(); depth += bufferingDepth ) + { + depthSlices.push_back(depth); + } + depthSlices.push_back(refCost->stripe.Depth()); + + // Encode weights based depth slices + cost->ofmDepthSlices = std::move(depthSlices); + + weightsRef = {&weightTens->bufferView, weightTens->srcTensor->AxisOrder(), weightTens->dataType, false}; + + encodingParams = _arch->WeightEncoder()->GetEncodingConfig(cost->Config(), weightsRef, + schedOp->Kernel(), ifm->tensor->dataType, cost->ofmDepthSlices, weightFormat); + + encodedWeightScales = EncodeWeightAndScaleTensor( + std::move(encodingParams), weightTens, scaleTens, weights->quantization, ofm->quantization); + + // Chosen buffering might not fit at all, iterate until it does + // or until the minimum usable slice size is reached + if ( encodedWeightScales.npuWeightsTensor->maxRangeBytes <= halfBufferLimit || prebufferDepth == OFMSplitDepth ) + { + break; + } + + // Failed to choose buffer sizes above, reduce them and try again + if ( bufferingDepth > prebufferDepth ) + { + bufferingDepth = RoundAway(bufferingDepth / 2, OFMSplitDepth); + } + else + { + prebufferDepth = RoundAway(prebufferDepth / 2, OFMSplitDepth); + } + } + + // Calculate cycles required to run the last op for use as future slack + int lastDepth = cost->ofmDepthSlices.back(); + lastDepth -= *(cost->ofmDepthSlices.rbegin() + 1); + auto tailCycles = 
EstimateOpPerformance(schedOp, cost->Config(), lastDepth); + cost->slackBufferingCycles = tailCycles.opCycles; + } + } + + // Determine whether the weights need to be double buffered + int encodedWeightsSize = encodedWeightScales.npuWeightsTensor->AllocationSizeBytes(); + weightBufferSize = std::min(encodedWeightsSize, encodedWeightScales.npuWeightsTensor->maxRangeBytes); + + // Only buffer weights if there's still space left for the buffer + if ( weightBufferSize <= bufferLimitBytes ) + { + assert(weightBufferSize % 16 == 0); // NOTE: vague check, leave validation until later? + + // Determine whether to double buffer or single buffer + Buffering buffering = Buffering::Single; + if ( (weightBufferSize * 2 <= bufferLimitBytes) && (weightBufferSize < encodedWeightsSize) ) + { + weightBufferSize = weightBufferSize * 2; + buffering = Buffering::Double; + } + + // Create a new tensor in fast storage to use as weights buffer + cost->bufferedWeightTensor.tensor = std::make_shared(); + cost->bufferedWeightTensor.tensor->srcTensor = encodedWeightScales.npuWeightsTensor->srcTensor; + cost->bufferedWeightTensor.tensor->allocatedSize = weightBufferSize; + cost->bufferedWeightTensor.tensor->memArea = _arch->StagingMemory(); + cost->bufferedWeightTensor.buffering = buffering; + + if ( cost->cascade == 0 ) + { + // Determine if the lifetime can be extended and pre-buffer weights under the previous operation + cost->bufferedWeightTensor.preBuffer = (weightBufferSize < slackMemory); + } + + cost->slackBufferingMemory -= weightBufferSize; + } + else + { + // Don't slice or buffer - use the whole depth from persistent storage + cost->ofmDepthSlices = std::move(ofmFullDepthSlices); + encodedWeightScales = std::move(fullWeightScales); + } + cost->SetWeightScaleTensors(encodedWeightScales.npuWeightsTensor, encodedWeightScales.npuScalesTensor); +} + + +std::shared_ptr Scheduler::ProposeMinimalSchedule() +{ + // Proposes scheduling parameters where every operator is subdivided into the 
smallest stripe that + // satisfies the next operators stride + auto minSchedule = std::make_shared(_name + "_MIN"); + + // Keep track of the previous Op - which consumes the current Op's OFM + SchedulerOperation *prevOp = nullptr; + + // Work backwards up the schedule setting the minimum stripe height + for ( auto pos = _ops.rbegin(); pos != _ops.rend(); pos++ ) + { + auto const &schedOp = *pos; + int minStripeHeight = (prevOp != nullptr) ? prevOp->Kernel()->Stride().y : 1; + Shape minStripe = Shape::PadAxes(schedOp->OFM()->shape, 3, 1); + minStripe[-3] = minStripeHeight; + auto cost = CreateSchedulerOpInfo(schedOp.get(), minStripe); + cost->cycles = EstimateOpPerformance(schedOp.get(), cost->Config(), schedOp->OFM()->shape.Depth()); + cost->elementAccess = EstimateOpElementAccess(schedOp.get(), cost->Config(), schedOp->OFM()->shape.Depth()); + minSchedule->SetCost(*schedOp, std::move(cost)); + + prevOp = schedOp.get(); + } + + return minSchedule; +} + + +std::shared_ptr Scheduler::OptimizeSchedule(Schedule *schedule, const std::shared_ptr &maxSchedule) +{ + // Extracts sub-schedules based on the cascades and optimizes them and applies them to the final schedule + if ( maxSchedule->fastStoragePeakUsage < _options.optimizationStagingLimit && !_spilling ) + { + return maxSchedule; + } + + // Optimize cascades separately + // Iterate over a copy of the cascades since they may change during the loop + auto cascades = schedule->cascades; + for ( const auto &pos : cascades ) + { + const CascadeInfo &cascadeInfo = pos.second; + + auto optSubSchedule = OptimizeSubSchedule(cascadeInfo, schedule, _options.optimizationStagingLimit); + if ( optSubSchedule != nullptr ) + { + // Remove the existing cascade + schedule->cascades.erase(pos.first); + // Move subschedule costs/cascades back into the schedule + SchedulerCostMap costs; + optSubSchedule->DetachCosts(costs); + schedule->UpdateCosts(costs); + schedule->UpdateCascades(optSubSchedule->cascades); + } + } + + // Update 
memory snapshot + UpdateOpMemorySnapshot(schedule); + + // Propose schedule buffering to the optimized schedule + auto optSchedule = ProposeScheduleBuffering(schedule, _options.optimizationStagingLimit); + optSchedule->cascades = std::move(schedule->cascades); // TODO: Check this is okay + // Copy the cascade's metadata from the unbuffered schedule + return optSchedule; +} + + +std::shared_ptr Scheduler::ProposeScheduleStriping(const Shape &finalStripe, const std::string &label, Schedule *refSchedule) +{ + // Proposes new striping for a schedule. The stripe is derived from the ifm requirements of the next Op down + auto stripedSchedule = std::make_shared(label); + + Shape stripe = finalStripe; + for ( auto pos = _ops.rbegin(); pos != _ops.rend(); pos++ ) + { + auto schedOp = pos->get(); + auto refCost = refSchedule->Cost(schedOp); + if ( !schedOp->IsNpuOp() || refCost == nullptr ) + { + // sched_op is not part of the sub-schedule - skip + continue; + } + + // Create a cost entry with the new stripe + auto cost = CreateSchedulerOpInfo(schedOp, stripe); + + // Estimate performance + cost->cycles = EstimateOpPerformance(schedOp, cost->Config(), schedOp->OFM()->shape.Depth()); + cost->elementAccess = EstimateOpElementAccess(schedOp, cost->Config(), schedOp->OFM()->shape.Depth()); + stripedSchedule->SetCost(*schedOp, std::move(cost)); + + // Calculate the preceding Op's stripe + stripe = schedOp->IFM(schedOp->PrimaryIfmIndex())->shape.With(-3, stripe.Height() * schedOp->Kernel()->Stride().y); + } + return stripedSchedule; +} + + +Address Scheduler::EstimateScheduleMemoryUsage(Schedule *schedule, const std::unordered_map &nonLocalMem) +{ + // Estimates the memory usage of a schedule + // cascades = schedule.cascades; + int peakMemUsage = 0; + for ( auto const &schedOp : _ops ) + { + auto cost = schedule->Cost(schedOp.get()); + if ( cost == nullptr ) + { + // sched_op is not part of the sub-schedule - skip + continue; + } + + if ( cost->cascade != 0 ) + { + // This Op is 
part of a cascade - use the cascade's memory usage + auto const &cascadeInfo = schedule->cascades.at(cost->cascade); + // Non-local memory usage is already included in the cascade_info + peakMemUsage = std::max(cascadeInfo.memUsage, peakMemUsage); + } + else + { + // This Op is not part of a cascade - calculate the memory usage + int opWeightBuffer = 0; + if ( cost->bufferedWeightTensor.tensor ) + { + opWeightBuffer = cost->bufferedWeightTensor.tensor->AllocationSizeBytes(); + } + + int opMemUsage = schedOp->IFM(0)->PartialAllocationSizeBytes() + schedOp->OFM()->PartialAllocationSizeBytes() + opWeightBuffer; + if ( nonLocalMem.find(*schedOp) != nonLocalMem.end() ) + { + opMemUsage += nonLocalMem.at(*schedOp); + } + + auto ifm1 = schedOp->TryIFM(1); + if ( ifm1 ) + { + opMemUsage += ifm1->PartialAllocationSizeBytes(); + } + + peakMemUsage = std::max(opMemUsage, peakMemUsage); + } + } + return peakMemUsage; +} + + +std::shared_ptr Scheduler::OptimizeSubSchedule(const CascadeInfo &cascadeInfo, Schedule *refSchedule, Address stagingLimitBytes) +{ + // Extracts the Ops covered by the given cascade and creates a sub-schedule. 
The sub-schedule is optimized by + // proposing weight buffering and then continuously proposing new stripe sizes + + // Extract the ops that are part of this sub-schedule + vector_span> subOps(_ops, cascadeInfo.start, cascadeInfo.end + 1); + + // Create a sub-schedule that contains only the costs for the Ops that are part of the sub-schedule + auto subSchedule = std::make_shared(_name + fmt::format("SUB_{}_{}", cascadeInfo.start, cascadeInfo.end)); + for ( auto &subOp : subOps ) + { + // NOTE: Copies the cost objects, consider optimising this + auto costCopy = std::make_unique(*refSchedule->Cost(subOp.get())); + subSchedule->SetCost(*subOp, std::move(costCopy)); + } + + // Update subschedule cascade list + subSchedule->cascades[cascadeInfo.end] = cascadeInfo; + + // Use the memory snapshot from the reference schedule (takes a copy) + subSchedule->memorySnapshot = refSchedule->memorySnapshot; + + SchedulerOperation *firstOp = subOps.front().get(); + + // Calculate memory usage that is live during the sub-schedule but not part of it + int timeForCascade = refSchedule->Cost(firstOp)->timeIndex; + + int memUsageParallelToSubSchedule = refSchedule->MemoryUsageAt(timeForCascade) - cascadeInfo.memUsage; + + // If the first Op's IFM has other consumers it has to live throughout the whole sub-schedule whether it's + // included in a cascade or not + int persistentInitialIFM = 0; + auto firstOpIfm = firstOp->IFM(firstOp->PrimaryIfmIndex()); + if ( firstOpIfm->tensor->consumers.size() > 1 ) + { + persistentInitialIFM = firstOpIfm->tensor->AllocationSizeBytes(); + } + + // Calculate non-local-mem-usage per Operator + std::unordered_map nonLocalMemUsage; + nonLocalMemUsage[*firstOp] = memUsageParallelToSubSchedule; + for ( int i = 1; i < subOps.size(); i++ ) + { + nonLocalMemUsage[*subOps[i]] = memUsageParallelToSubSchedule + persistentInitialIFM; + } + + CascadeBuilder cascadeBuilder(subOps, nonLocalMemUsage, _spilling); + + // Start by adding buffering + auto 
bufferedSubSchedule = ProposeScheduleBuffering(subSchedule.get(), _options.optimizationStagingLimit); + + // Copy the cascades over from the unbuffered-schedule + bufferedSubSchedule->cascades = subSchedule->cascades; + + // Generate the possible stripings for the final Op in the sub-schedule + Shape finalOFMShape = subOps.back()->OFM()->shape; + const int maxStripeHeight = (finalOFMShape.Height() + 1) / 2; + + std::vector possibleStripes; + possibleStripes.reserve(maxStripeHeight); + for ( int h = 1; h <= maxStripeHeight; h++ ) + { + possibleStripes.push_back(finalOFMShape.With(-3, h)); + } + + // Propose different striping - the possible stripes are proposed similarly to a binary search + std::shared_ptr bestSchedule; + int first = 0, last = int(possibleStripes.size()); + +#if LOG_TRACE1_ON + LOG_INDENT(Logging::Out); +#endif + + while ( first < last ) + { + int index = first + (last - first) / 2; + const Shape &proposedStripe = possibleStripes[index]; + + auto proposedSchedule = ProposeScheduleStriping( + proposedStripe, fmt::format("_OPT_{}", proposedStripe.Height()), bufferedSubSchedule.get()); + + cascadeBuilder.BuildCascades(proposedSchedule.get(), _maxSchedule.get(), stagingLimitBytes); + + // Check if proposal fits + Address proposedMemUsage = EstimateScheduleMemoryUsage(proposedSchedule.get(), nonLocalMemUsage); + + if ( proposedMemUsage <= stagingLimitBytes ) + { + // Ignore all possible stripes smaller than this + first = index + 1; + bestSchedule = proposedSchedule; + // No cascading required - early exit + if ( proposedSchedule->cascades.empty() ) + { + break; + } + } + else + { + // Proposal doesn't fit within the limit - ignore all possible stripes larger than this + last = index; + } + } + + return bestSchedule; +} + + +void Scheduler::ApplySchedule(Schedule *schedule) +{ + const auto idealFormat = _arch->IdealBufferingFormat(); + + // Applies the given schedule as the end result + for ( auto &schedOp : _ops ) + { + if ( !schedOp->IsNpuOp() ) + { + 
continue; + } + + auto cost = schedule->Cost(schedOp.get()); + if ( cost->cascade > 0 ) + { + const CascadeInfo &cascadeInfo = schedule->cascades.at(cost->cascade); + auto pos = cascadeInfo.buffers.find(*schedOp); + if ( pos != cascadeInfo.buffers.end() ) + { + auto bufferTensor = schedOp->IFM(schedOp->PrimaryIfmIndex())->tensor.get(); + // Apply memory area + bufferTensor->memArea = _arch->StagingMemory(); + // Apply rolling buffer dimensions + Shape bufferShape = pos->second.shape; + assert(!bufferTensor->needsLinearFormat); + bufferTensor->format = idealFormat; + assert(bufferShape.Width() == bufferTensor->storageShape.Width() && "Only y-striping implemented"); + bufferTensor->storageShape = bufferTensor->storageShape.WithHW(bufferShape.Height(), bufferShape.Width()); + } + } + + // Check buffering tensors are meaningfully defined + assert(!cost->bufferedWeightTensor.tensor || + (cost->bufferedWeightTensor.tensor->allocatedSize == 0 || (cost->bufferedWeightTensor.tensor->srcTensor != nullptr))); + } +} + + +// Coalesce repeated weight buffer tensors +void Scheduler::CoalesceWeightBufferTensors(Schedule *schedule) +{ + SchedulerOpInfo *prevCost = nullptr; + + for ( auto &schedOp : _ops ) + { + if ( !schedOp->IsNpuOp() ) + { + continue; + } + + auto cost = schedule->Cost(schedOp.get()); + if ( prevCost && cost ) + { + auto &prevBufTensor = prevCost->bufferedWeightTensor.tensor; + auto &bufTensor = cost->bufferedWeightTensor.tensor; + if ( prevBufTensor && bufTensor ) + { + UniqueId prevWeightsTensorId = prevCost->npuWeightsTensor ? prevCost->npuWeightsTensor->equivalenceId : -1; + UniqueId weightsTensorId = cost->npuWeightsTensor ? 
cost->npuWeightsTensor->equivalenceId : -2; + if ( prevWeightsTensorId == weightsTensorId && prevBufTensor->allocatedSize == bufTensor->allocatedSize && + prevCost->ofmDepthSlices.size() == 2 && cost->ofmDepthSlices.size() == 2 && prevCost->ofmDepthSlices == cost->ofmDepthSlices ) + { + // Reuse previous weight buffer tensor if both current and previous op use 1 depth slice + // This will extend the life range weight buffer tensor + bufTensor = prevBufTensor; + } + } + } + + prevCost = cost; + } +} + + +PerformanceQuery Scheduler::InitPerfQuery(SchedulerOperation *op, ArchitectureOpConfig *config, int ofmDepth = -1) +{ + PerformanceQuery query = {}; + query.type = op->Type(); + query.kernel = op->Kernel(); + query.config = config; + + SchedulerConnection *ifm0 = op->IFM(0); + query.ifmShape[0] = ifm0->shape; + query.ifmMemory[0] = ifm0->tensor->memArea.memory; + query.ifmType[0] = ifm0->tensor->dataType; + query.ifmFormat[0] = ifm0->tensor->format; + + SchedulerConnection *ifm1 = op->TryIFM(1); + if ( ifm1 ) + { + query.ifmShape[1] = ifm1->shape; + query.ifmMemory[1] = ifm1->tensor->memArea.memory; + query.ifmType[1] = ifm1->tensor->dataType; + query.ifmFormat[1] = ifm1->tensor->format; + } + + SchedulerConnection *ofm = op->OFM(); + query.ofmShape = (ofmDepth >= 0) ? 
ofm->shape.WithDepth(ofmDepth) : ofm->shape; + query.ofmMemory = ofm->tensor->memArea.memory; + query.ofmType = ofm->tensor->dataType; + query.ofmFormat = ofm->tensor->format; + + SchedulerConnection *scales = op->TryInput(TensorUsage::Scales); + if ( scales ) + { + query.constShape = Shape(1, 1, 1, query.ofmShape.Depth()); + query.constMemory = scales->tensor->memArea.memory; + } + + return query; +} + + +std::vector Scheduler::InitFusionQuery(SchedulerOperation *op) +{ + std::vector fused; + + for ( auto const &subOp : op->SubOps() ) + { + fused.emplace_back(); + + FusionQuery &fusedOp = fused.back(); + fusedOp.type = subOp->Type(); + fusedOp.kernel = subOp->Kernel(); + auto ifm2 = subOp->TryIFM(1); + if ( ifm2 ) + { + fusedOp.ifm2Shape = ifm2->shape; + fusedOp.ifm2Memory = ifm2->tensor->memArea.memory; + fusedOp.ifm2Type = ifm2->tensor->dataType; + fusedOp.ifm2Format = ifm2->tensor->format; + } + } + + return fused; +} + + +CycleCost Scheduler::EstimateOpPerformance(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth) +{ + CycleCost cycleCost; + if ( !op->IsNpuOp() ) + { + LOG_WARN("CPU performance estimation for \"{}\" not implemented\n", OpTypeToString(op->Type())); + return cycleCost; + } + + PerformanceQuery query = InitPerfQuery(op, config, ofm_depth); + std::vector fused = InitFusionQuery(op); + cycleCost = _arch->Performance()->MeasureCycleCost(query, fused); + return cycleCost; +} + +ElementAccess Scheduler::EstimateOpElementAccess(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth) +{ + ElementAccess access; + if ( !op->IsNpuOp() ) + { + LOG_WARN("CPU performance estimation for \"{}\" not implemented\n", OpTypeToString(op->Type())); + return access; + } + PerformanceQuery query = InitPerfQuery(op, config, ofm_depth); + access = _arch->Performance()->MeasureElementAccess(query); + return access; +} + +void Scheduler::PrintSchedule(Schedule *schedule) +{ + LOG_PRINT("Schedule: '{}'\n", schedule->Name()); + for ( auto const 
&schedOp : _ops ) + { + auto cost = schedule->Cost(schedOp.get()); + if ( cost == nullptr ) + { + continue; + } + + LOG_PRINT("\t{0}: Operation {1} - OFM {2}\n", schedOp->Index(), OpTypeToString(schedOp->Type()), + schedOp->OFM()->shape.ToString()); + LOG_PRINT("\t\tKernel: {0}\n", schedOp->Kernel()->ToString()); + + if ( !schedOp->IsNpuOp() ) + { + LOG_PRINT("\t\tCPU Operation\n"); + } + else + { + LOG_PRINT("{0}\n", cost->ToString()); + } + + int mem_usage = 0; + if ( cost->timeIndex >= 0 && cost->timeIndex < int(schedule->memorySnapshot.size()) ) + { + mem_usage = schedule->memorySnapshot[cost->timeIndex]; + } + + LOG_PRINT("\t\tEstimated Perf: Macs={0} Cycles={1}\n", cost->cycles.macs, cost->cycles.opCycles); + LOG_PRINT("\t\tMemory Used: {0} bytes\n", mem_usage); + } + + LOG_PRINT("\tCascades:\n"); + auto const &cascades = schedule->cascades; + + // Sort cascade contents by id and start time + std::vector keys; + for ( auto const &pos : cascades ) + { + keys.push_back(pos.first | (pos.second.start << 16)); + } + std::sort(keys.begin(), keys.end()); + + // Print sorted cascade indices + for ( auto key : keys ) + { + auto const &cascade = cascades.at(key & 0xFFFF); + LOG_PRINT("\t\t{0}: {1} -> {2}, size: {3}\n", key & 0xFFFF, cascade.start, cascade.end, cascade.memUsage); + } +} + + +void ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader) +{ + // Parse debug settings + std::string key; + while ( reader.Begin(key) ) + { + if ( key == "optimize" ) + { + std::string value; + if ( reader.Read(value) ) + { + if ( _strnicmp(value.data(), "size", 5) == 0 ) + { + opt.optimizationStrategy = OptimizationStrategy::Size; + } + else if ( _strnicmp(value.data(), "performance", 12) == 0 ) + { + opt.optimizationStrategy = OptimizationStrategy::Performance; + } + } + } + else if ( key == "verbose" ) + { + opt.verboseSchedule = reader.Get(); + } + else if ( key == "verbose_allocation" ) + { + opt.verboseAllocation = reader.Get(); + } + else if ( key == 
"arena_size_limit" ) + { + opt.optimizationStagingLimit = reader.Get(); + std::string suffix; + if ( reader.Read(suffix) ) + { + if ( suffix == "kb" ) + { + opt.optimizationStagingLimit *= 1024; + } + else if ( suffix == "mb" ) + { + opt.optimizationStagingLimit *= 1024 * 1024; + } + } + } + + reader.End(); + } +} + + +struct SchedulerTransformParam : public WeightTransformParam +{ + const int64_t *zeroPoints; + int zeroCount; +}; + + +static int ApplyZeroPointIHWO(const WeightTransformParam *param, int value) +{ + const SchedulerTransformParam *p = static_cast(param); + value = (value - int(p->zeroPoints[p->o % p->zeroCount])); + assert(value >= -255 && value <= 255); + return value; +} + + +static int ApplyZeroPointOHWI(const WeightTransformParam *param, int value) +{ + const SchedulerTransformParam *p = static_cast(param); + value = (value - int(p->zeroPoints[p->i % p->zeroCount])); + assert(value >= -255 && value <= 255); + return value; +} + +WeightScaleTensors Scheduler::EncodeWeightAndScaleTensor(std::unique_ptr encodingParams, const SchedulerTensor *weightTens, + const SchedulerTensor *scaleTens, const Quantization &weightQuantization, const Quantization &ofmQuantization) +{ + bool doWeights = true; + bool doScales = true; + + // Check cache for weight tensors already encoded with this configuration. + auto cacheKey = TensorCacheKey(encodingParams.get(), weightTens->equivalenceId); + auto pos = _tensorCache.find(cacheKey); + std::shared_ptr cachedWeightsTensor; + if ( pos != _tensorCache.end() ) + { + const WeightScaleTensors &cached = pos->second; + assert(ofmQuantization.type == QuantizationType::EXPLICIT); + uint32_t scaleHash = HashVector32(ofmQuantization.scales); + // If scale tensor hashes match, return this combined weights tensor. 
+ if ( cached.scaleHash == scaleHash ) + { + return cached; + } + // Already cached weights, but scales differ so perform scale encoding + cachedWeightsTensor = cached.npuWeightsTensor; + doWeights = false; + } + + // Attempt the encode (may fail) + WeightScaleTensors result = TryEncodeWeightAndScaleTensor( + encodingParams.get(), weightTens, scaleTens, weightQuantization, ofmQuantization, doWeights, doScales); + result.scaleHash = HashVector32(ofmQuantization.scales); + + if ( doWeights ) + { + // Weights and scales now encoded together + _tensorCache.emplace(cacheKey, result); + result.npuWeightsTensor->config = std::move(encodingParams); + } + else + { + // Going to reuse a cached tensor for weights (must alias if memory areas don't match). + if ( cachedWeightsTensor->memArea == weightTens->memArea ) + { + result.npuWeightsTensor = std::move(cachedWeightsTensor); + } + else + { + // TODO: Clone tensor (but share buffer) if mem area assignment conflicts. + // Or cache encoded buffers and always wrap in a new tensor. 
+ assert(false); + throw WeightEncodeException{}; + } + } + + return result; +} + +WeightScaleTensors Scheduler::TryEncodeWeightAndScaleTensor(IWeightEncodingConfig *encodingParams, + const SchedulerTensor *weightTens, const SchedulerTensor *scaleTens, const Quantization &weightQuantization, + const Quantization &ofmQuantization, bool doWeights, bool doScales) +{ + // Create tensor to hold encoded output + auto npuTensor = std::make_shared(); + npuTensor->memArea = weightTens->memArea; + + int weightRangeIndex = 0; + int maxSingleBufferLen = 0; + std::vector encodedStream; + + const auto &weightView = weightTens->bufferView; + Shape wshape = weightView.ViewShape(); + auto strides = weightView.StrideBytes() * 8 / DataTypeSizeBits(weightTens->dataType); + + Shape ohwiStrides; + Shape ohwiShape; + if ( weightTens->srcTensor->AxisOrder() == AxisOrder::IHWO ) + { + ohwiStrides = strides.Extract(3, 1, 2, 0); + ohwiShape = wshape.Extract(3, 1, 2, 0); + } + else + { + ohwiStrides = std::move(strides); + ohwiShape = std::move(wshape); + } + + // Set up weight source + SchedulerTransformParam param; + param.zeroPoints = weightQuantization.zeroPoints.data(); + param.zeroCount = int(weightQuantization.zeroPoints.size()); + + auto zeroOffsetFunc = (weightTens->srcTensor->AxisOrder() == AxisOrder::IHWO) ? 
ApplyZeroPointIHWO : ApplyZeroPointOHWI; + auto weightSource = _arch->WeightEncoder()->GetWeightSource(encodingParams, weightTens->dataType, zeroOffsetFunc, &param); + auto scaleSource = _arch->WeightEncoder()->GetScaleSource(encodingParams, scaleTens->dataType, ofmQuantization); + + auto weightsData = weightView.Buffer()->Data(); + int fullOfmDepth = ohwiShape[0]; + int totalWeightBytes = 0; + int subStreams = 1; + int scaleStreamsRequired = 1; + const int streamsRequired = _arch->WeightEncoder()->StreamsRequired(encodingParams, ohwiShape, scaleStreamsRequired); + + // Note: in case of multiple cores, each core's weights are interleaved in O-dimension + auto const &depthOffsets = encodingParams->DepthOffsets(); + const int nrDepthOffsets = int(depthOffsets.size()); + for ( int idx = 0; idx < nrDepthOffsets - 1; ++idx ) + { + int depthOffset = depthOffsets[idx]; + + // Do not generate for offsets outside the OFM + assert(depthOffset >= 0 && depthOffset < fullOfmDepth); + int depthLength = depthOffsets[idx + 1] - depthOffset; + + int bufferStartOffset = int(encodedStream.size()); + + // For each stream, deinterleave weights/scales from the larger volume + // and generate separate compressed streams. 
+ for ( int stream = 0; stream < streamsRequired; ++stream ) + { + int key = WeightKey(stream, depthOffset); + WeightRange range; + range.offset = int(encodedStream.size()); + range.index = weightRangeIndex++; + + if ( doScales && stream < scaleStreamsRequired ) + { + // Encode Scales and biases + auto biases = scaleTens->bufferView.Values(); + scaleSource->SetSource(&biases[0], scaleTens->bufferView.ViewShape().Depth(), depthOffset, depthLength, stream); + if ( scaleSource->Elements() == 0 ) + { + // No more elements left to encode + continue; + } + range.scaleBytes = _arch->WeightEncoder()->EncodeScales(encodingParams, scaleSource.get(), encodedStream, false); + + // Align to 16 for start of next substream + while ( encodedStream.size() % 16 != 0 ) + { + encodedStream.push_back(0); + } + } + + if ( doWeights ) + { + range.weightOffset = int(encodedStream.size()) - range.offset; + + // Encode Weights + ohwiShape[0] = depthLength; + weightSource->SetSource(weightsData, depthOffset, ohwiShape, ohwiStrides, stream); + int len = + _arch->WeightEncoder()->EncodeWeights(encodingParams, weightSource.get(), encodedStream, false).encodedSize; + range.weightBytes = len; + totalWeightBytes += len; + } + + assert(encodedStream.size() % 16 == 0); + npuTensor->encodedRanges[key] = range; + subStreams = std::max(stream + 1, subStreams); + } + + // Remember maximum encoded length for DoubleBuffering + maxSingleBufferLen = std::max(maxSingleBufferLen, int(encodedStream.size()) - bufferStartOffset); + } + + // Reduce stored memory usage as much as possible + encodedStream.shrink_to_fit(); + + auto encodedTensor = std::make_shared(doWeights ? 
weightTens->Name() : scaleTens->Name(), DataType::UInt8); + int streamSize = int(encodedStream.size()); + auto buf = std::make_shared(std::move(encodedStream)); + encodedTensor->SetStorageShape(Shape(1, 1, 1, streamSize)); + encodedTensor->SetBuffer(buf); + + npuTensor->srcTensor = encodedTensor; + npuTensor->maxRangeBytes = maxSingleBufferLen; + npuTensor->totalWeightBytes = totalWeightBytes; + npuTensor->subStreams = subStreams; + npuTensor->storageShape = encodedTensor->StorageShape(); + npuTensor->allocatedSize = encodedTensor->View().BufferSize(); + + WeightScaleTensors result; + result.scaleHash = HashVector32(ofmQuantization.scales); + + if ( doWeights ) + { + result.npuWeightsTensor = std::move(npuTensor); + result.npuScalesTensor = nullptr; + } + else + { + // Only scales encoded + assert(doScales); + result.npuScalesTensor = std::move(npuTensor); + } + + return result; +} + +WeightsInfo Scheduler::AnalyzeWeights(IWeightEncodingConfig *encodingParams, const SchedulerTensor *weightTens, const Quantization &weightQuantization) +{ + + WeightsInfo result; + std::vector encodedStream; + + const auto &weightView = weightTens->bufferView; + Shape wshape = weightView.ViewShape(); + auto strides = weightView.StrideBytes() * 8 / DataTypeSizeBits(weightTens->dataType); + + Shape ohwiStrides; + Shape ohwiShape; + if ( weightTens->srcTensor->AxisOrder() == AxisOrder::IHWO ) + { + ohwiStrides = strides.Extract(3, 1, 2, 0); + ohwiShape = wshape.Extract(3, 1, 2, 0); + } + else + { + ohwiStrides = std::move(strides); + ohwiShape = std::move(wshape); + } + + // Set up weight source + SchedulerTransformParam param; + param.zeroPoints = weightQuantization.zeroPoints.data(); + param.zeroCount = int(weightQuantization.zeroPoints.size()); + + auto zeroOffsetFunc = (weightTens->srcTensor->AxisOrder() == AxisOrder::IHWO) ? 
ApplyZeroPointIHWO : ApplyZeroPointOHWI; + auto weightSource = _arch->WeightEncoder()->GetWeightSource(encodingParams, weightTens->dataType, zeroOffsetFunc, &param); + auto weightsData = weightView.Buffer()->Data(); + int fullOfmDepth = ohwiShape[0]; + int subStreams = 1; + int scaleStreamsRequired = 1; + const int streamsRequired = _arch->WeightEncoder()->StreamsRequired(encodingParams, ohwiShape, scaleStreamsRequired); + // Note: in case of multiple cores, each core's weights are interleaved in O-dimension + auto const &depthOffsets = encodingParams->DepthOffsets(); + const int nrDepthOffsets = int(depthOffsets.size()); + for ( int idx = 0; idx < nrDepthOffsets - 1; ++idx ) + { + int depthOffset = depthOffsets[idx]; + + // Do not generate for offsets outside the OFM + assert(depthOffset >= 0 && depthOffset < fullOfmDepth); + int depthLength = depthOffsets[idx + 1] - depthOffset; + + int bufferStartOffset = int(encodedStream.size()); + + // For each stream, deinterleave weights/scales from the larger volume + // and generate separate compressed streams. 
+ for ( int stream = 0; stream < streamsRequired; ++stream ) + { + int key = WeightKey(stream, depthOffset); + // Encode Weights + ohwiShape[0] = depthLength; + weightSource->SetSource(weightsData, depthOffset, ohwiShape, ohwiStrides, stream); + auto weightInfo = _arch->WeightEncoder()->EncodeWeights(encodingParams, weightSource.get(), encodedStream, true); + result.sourceSize += weightInfo.sourceSize; + result.encodedSize += weightInfo.encodedSize; + result.zeroCount += weightInfo.zeroCount; + } + } + result.streams = streamsRequired; + return result; +} +} // namespace regor diff --git a/ethosu/regor/compiler/scheduler.hpp b/ethosu/regor/compiler/scheduler.hpp new file mode 100644 index 00000000..f028cb42 --- /dev/null +++ b/ethosu/regor/compiler/scheduler.hpp @@ -0,0 +1,340 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "common/common.hpp" + +#include "architecture/architecture.hpp" +#include "architecture/weight_encoder.hpp" +#include "cascade_builder.hpp" +#include "common/shape.hpp" +#include "graph.hpp" +#include "quantization.hpp" +#include "scheduler_operation.hpp" + +#include +#include +#include +#include +#include + +namespace regor +{ + +class IncrementalLinearAllocator; + +enum class OptimizationStrategy +{ + Size, + Performance, +}; + +/// +/// Scheduling options +/// +struct SchedulerOptions +{ + OptimizationStrategy optimizationStrategy = OptimizationStrategy::Size; + Address optimizationStagingLimit = 0; + bool verboseSchedule = false; + bool verboseAllocation = false; +}; + +/// +/// Metadata for each scheduled operation (unique per schedule) +/// +class SchedulerOpInfo +{ +private: + std::unique_ptr _config; + +public: + Shape stripeInput[2]; + Shape stripe; + int cascade = 0; + int timeIndex = -1; + int weightSize = 0; + std::vector ofmDepthSlices; + int64_t slackBufferingCycles = 0; + int slackBufferingMemory = 0; + int64_t fullWeightTransferCycles = 0; + // Encoded weights in readonly memory + std::shared_ptr npuWeightsTensor; + // Encoded scales in readonly memory + std::shared_ptr npuScalesTensor; + // Buffered weights/scales in fast storage + SchedulerConnection bufferedWeightTensor; + CycleCost cycles; + ElementAccess elementAccess; + +public: + SchedulerOpInfo(std::unique_ptr opConfig, const Shape &stripeInput1, const Shape &stripeInput2, const Shape &stripe_) + { + this->_config = std::move(opConfig); + this->stripeInput[0] = stripeInput1; + this->stripeInput[1] = stripeInput2; + this->stripe = stripe_; + this->ofmDepthSlices = {0, stripe_.Depth()}; + } + + SchedulerOpInfo(const SchedulerOpInfo &other) { Copy(other); } + + const SchedulerOpInfo &operator=(const SchedulerOpInfo &other) + { + Copy(other); + return *this; + } + + void SetWeightScaleTensors(const std::shared_ptr &weights, const std::shared_ptr &scales) + { + 
npuWeightsTensor = weights; + npuScalesTensor = scales; + } + + ArchitectureOpConfig *Config() const { return _config.get(); } + + std::string ToString() const + { + std::string temp = fmt::format("\t\tTime index = {0}\n", this->timeIndex); + temp += fmt::format( + "\t\tOperator Config = {0}\n" + "\t\tIFM Stripe = [{1}]\n" + "\t\tIFM2 Stripe = [{2}]\n" + "\t\tOFM Stripe = [{3}]\n", + _config ? _config->ToString(false) : "no config", stripeInput[0].ToString(), stripeInput[1].ToString(), + stripe.ToString()); + + temp += fmt::format("\t\tAssigned Cascade = {0}", this->cascade); + + if ( npuWeightsTensor ) + { + // TODO: Finish formatting; + temp += fmt::format( + "\n\t\tEncoded Weights = {0} bytes\n" + "\t\tWeight buffer = {1} bytes\n" + "\t\tDepth slices = [{2}]", + npuWeightsTensor->AllocationSizeBytes(), + bufferedWeightTensor.tensor ? bufferedWeightTensor.tensor->AllocationSizeBytes() : 0, fmt::join(ofmDepthSlices, ", ")); + } + + return temp; + } + +private: + void Copy(const SchedulerOpInfo &other) + { + if ( other._config ) + { + // Must duplicate (can't be auto-generated) + _config = other._config->Clone(); + } + + // Potentially generatable + stripeInput[0] = other.stripeInput[0]; + stripeInput[1] = other.stripeInput[1]; + stripe = other.stripe; + cascade = other.cascade; + timeIndex = other.timeIndex; + weightSize = other.weightSize; + ofmDepthSlices = other.ofmDepthSlices; + slackBufferingCycles = other.slackBufferingCycles; + slackBufferingMemory = other.slackBufferingMemory; + fullWeightTransferCycles = other.fullWeightTransferCycles; + npuWeightsTensor = other.npuWeightsTensor; + npuScalesTensor = other.npuScalesTensor; + bufferedWeightTensor = other.bufferedWeightTensor; + cycles = other.cycles; + elementAccess = other.elementAccess; + } +}; + + +using SchedulerCostMap = std::unordered_map>; + +/// +/// Individual schedule +/// +class Schedule +{ +private: + std::string _name; + SchedulerCostMap _costMap; + +public: + std::unordered_map cascades; + 
std::vector memorySnapshot; + int fastStoragePeakUsage = 0; + std::unordered_map memoryUsage; + +public: + Schedule(const std::string &name) : _name(name) {} + + const std::string &Name() const { return _name; } + + void SetCost(UniqueId id, std::unique_ptr opInfo) { _costMap[id] = std::move(opInfo); } + + SchedulerOpInfo *Cost(const SchedulerOperation *op) const { return op ? Cost(*op) : nullptr; } + SchedulerOpInfo *Cost(UniqueId id) const + { + auto pos = _costMap.find(id); + return (pos != _costMap.end()) ? pos->second.get() : nullptr; + } + + const SchedulerCostMap &Costs() const { return _costMap; } + + int MemoryUsageAt(int timeIndex) const + { + return (timeIndex < int(memorySnapshot.size())) ? memorySnapshot[timeIndex] : 0; + } + + void DetachCosts(SchedulerCostMap &costs) { costs = std::move(_costMap); } + + void UpdateCosts(SchedulerCostMap &costs) + { + for ( auto &pos : costs ) + { + _costMap[pos.first] = std::move(pos.second); + } + } + + void UpdateCascades(const std::unordered_map &other) + { + cascades.insert(other.begin(), other.end()); + } + + const CascadeInfo *Cascade(int cascade) const + { + auto it = cascades.find(cascade); + return it == cascades.end() ? 
nullptr : &it->second; + } +}; + + +/// +/// Executable scheduling implementation +/// +class Scheduler +{ + struct TensorCacheKey + { + public: + IWeightEncodingConfig *_config; // must persist as map entry + UniqueId _uid; + + public: + TensorCacheKey(IWeightEncodingConfig *config, UniqueId uid) : _config(config), _uid(uid) {} + + bool operator==(const TensorCacheKey &other) const + { + return _config->Equals(other._config) && _uid == other._uid; + } + }; + + struct TensorCacheHash + { + std::size_t operator()(const TensorCacheKey &key) const + { + return key._config->Hash() + 37 * std::uintptr_t(key._uid); + } + }; + +private: + Architecture *_arch = nullptr; + SchedulerOptions _options; + std::string _name; + std::vector> &_ops; + std::shared_ptr _maxSchedule; + int _minMemoryRequired = 0; + bool _spilling = false; + std::unordered_map _tensorCache; + +public: + Scheduler(Architecture *arch, const SchedulerOptions &options, const std::string &name, + std::vector> &ops); + +public: + std::shared_ptr Process(); + + static std::unique_ptr ToGraph(std::vector> &ops, + std::unordered_map &tensorAddressMap, const Graph *srcGraph); + + void AllocateReadOnlyAddresses(Schedule *schedule, IncrementalLinearAllocator &readOnlyAllocator); + + static PerformanceQuery InitPerfQuery(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth); + static std::vector InitFusionQuery(SchedulerOperation *op); + +private: + Address CreateSchedulerRepresentation(); + + Point2i GetStripeInputRequirement(const Shape &ofmShape, Kernel *kernel, ArchResampling resampling); + + std::unique_ptr CreateSchedulerOpInfo(SchedulerOperation *op, const Shape &ofmStripeShape); + + std::unique_ptr CreateInitialSchedule(); + + void MoveConstantData(Schedule *refSchedule); + + void AllocateAddresses(Schedule *schedule); + + void UpdateOpMemorySnapshot(Schedule *schedule); + + std::shared_ptr ProposeScheduleBuffering(Schedule *refSchedule, Address stagingLimitBytes); + + void 
ProposeOperatorBuffering(SchedulerOperation *schedOp, SchedulerOperation *prevOp, Schedule *bufferedSchedule, + Schedule *refSchedule, int stagingLimitBytes); + + void ProposeWeightBuffering(SchedulerConnection *weights, SchedulerConnection *scales, SchedulerOperation *schedOp, + SchedulerOperation *prevOp, Schedule *bufferedSchedule, Schedule *refSchedule, int bufferLimitBytes); + + std::shared_ptr ProposeMinimalSchedule(); + + std::shared_ptr OptimizeSchedule(Schedule *schedule, const std::shared_ptr &maxSchedule); + + std::shared_ptr ProposeScheduleStriping(const Shape &finalStripe, const std::string &label, Schedule *refSchedule); + + Address EstimateScheduleMemoryUsage(Schedule *schedule, const std::unordered_map &nonLocalMem); + + std::shared_ptr OptimizeSubSchedule(const CascadeInfo &cascadeInfo, Schedule *refSchedule, Address stagingLimitBytes); + + void ApplySchedule(Schedule *schedule); + + void CoalesceWeightBufferTensors(Schedule *schedule); + + CycleCost EstimateOpPerformance(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth); + + ElementAccess EstimateOpElementAccess(SchedulerOperation *op, ArchitectureOpConfig *config, int ofm_depth); + + void PrintSchedule(Schedule *schedule); + + WeightScaleTensors EncodeWeightAndScaleTensor(std::unique_ptr encodingParams, const SchedulerTensor *weightTens, + const SchedulerTensor *scaleTens, const Quantization &weightQuantization, const Quantization &ofmQuantization); + + WeightScaleTensors TryEncodeWeightAndScaleTensor(IWeightEncodingConfig *encodingParams, + const SchedulerTensor *weightTens, const SchedulerTensor *scaleTens, const Quantization &weightQuantization, + const Quantization &ofmQuantization, bool doWeights, bool doScales); + + WeightsInfo AnalyzeWeights(IWeightEncodingConfig *encodingParams, const SchedulerTensor *weightTens, const Quantization &weightQuantization); + + Flags BestWeightFormat( + SchedulerOperation *op, Shape &ifmShape, Shape &ifm2Shape, Shape &ofmShape, Flags 
weightFormat); +}; + +void ParseSchedulerOptions(SchedulerOptions &opt, IniReader &reader); + +} // namespace regor diff --git a/ethosu/regor/compiler/scheduler_decompose.cpp b/ethosu/regor/compiler/scheduler_decompose.cpp new file mode 100644 index 00000000..5e880f95 --- /dev/null +++ b/ethosu/regor/compiler/scheduler_decompose.cpp @@ -0,0 +1,326 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "scheduler_decompose.hpp" + +#include "operation_util.hpp" + +#include +#include + +namespace regor +{ + +bool NeedsDecompose(Architecture *arch, const SchedulerOperation *schedOp) +{ + return CanDecompose(arch, schedOp) && !CanRunOnHardware(arch, schedOp); +} + +static std::unique_ptr MakeSubOperation(const SchedulerOperation *schedOp, const Kernel *newKernel = nullptr) +{ + assert(schedOp->SubOps().empty()); + assert(schedOp->Parent() == nullptr); + auto subOp = std::make_unique(schedOp->Type()); + subOp->SetKernel(newKernel ? 
newKernel : schedOp->Kernel()); + subOp->SetHasScaling(schedOp->HasScaling()); + subOp->_srcKey = schedOp->_srcKey; + subOp->SetPrimaryIfmIndex(schedOp->PrimaryIfmIndex()); + subOp->SetParameters(schedOp->Parameters()); + subOp->SetRounding(schedOp->Rounding()); + subOp->SetAccumulatorMode(schedOp->AccumulatorMode()); + for ( const auto *list : {&schedOp->inputs, &schedOp->outputs} ) + { + for ( const auto &item : list->pairs() ) + { + auto usage = item.first; + const auto &connection = item.second; + if ( IsOFM(usage) ) + { + connection.tensor->producers.push_back(subOp.get()); + *subOp->AddOutput(usage) = connection; + } + else + { + connection.tensor->consumers.push_back(subOp.get()); + *subOp->AddInput(usage) = connection; + } + } + } + return subOp; +} + +static auto GetArchAccumulatorSource(const AccumulatorControl &ac) +{ + switch ( ac.source ) + { + case AccumulatorSource::Reset: + return ArchAccumulatorSource::Reset; + case AccumulatorSource::Acc: + return ArchAccumulatorSource::Acc; + case AccumulatorSource::Ifm2: + return ArchAccumulatorSource::Ifm2; + default: + assert(false); + return ArchAccumulatorSource::Reset; + } +} + +bool CanRunOnHardware(Architecture *arch, const SchedulerOperation *schedOp) +{ + regor::ArchitectureOpGroupQuery qOpGroup{}; + auto *ifm = schedOp->TryIFM(0); + auto *ifm2 = schedOp->TryIFM(1); + auto *ofm = schedOp->TryOFM(); + + if ( !ifm || !ofm ) return false; + qOpGroup.type = schedOp->Type(); + qOpGroup.kernel = schedOp->Kernel(); + qOpGroup.ifm.key = ifm->tensor->uid; + qOpGroup.ifm.type = ifm->tensor->dataType; + if ( ifm2 ) + { + qOpGroup.ifm2.key = ifm2->tensor->uid; + qOpGroup.ifm2.type = ifm2->tensor->dataType; + } + qOpGroup.ofm.key = ofm->tensor->uid; + qOpGroup.ofm.type = ofm->tensor->dataType; + if ( arch->CreateOpGroup(qOpGroup) == nullptr ) return false; + regor::ArchitectureConfigQuery qConfig; + qConfig.ofmShape = Shape::PadAxes(ofm->SliceShape(), 3, 1); + qConfig.ifmShape[0] = ifm->SliceShape(); + if ( ifm2 ) + 
{ + qConfig.ifmShape[1] = ifm2->SliceShape(); + } + qConfig.ifmBits = DataTypeSizeBits(ifm->tensor->dataType); + qConfig.kernel = schedOp->Kernel(); + qConfig.lutBytes = schedOp->TryInput(TensorUsage::LUT) ? 2048 : 0; + qConfig.scaled = schedOp->HasScaling(); + qConfig.ifmResampling = ifm->resamplingMode; + qConfig.ofmShape = qConfig.ofmShape.Untranspose(ofm->transpose); + qConfig.transpose = ofm->transpose; + qConfig.ofmFormat = ofm->tensor->format; + const auto &accMode = schedOp->AccumulatorMode(); + qConfig.accSource = GetArchAccumulatorSource(accMode); + qConfig.accOutputEnabled = accMode.outputEnabled; + return arch->GetOpConfig(schedOp->Type(), qConfig) != nullptr; +} + +bool CanDecompose(Architecture *, const SchedulerOperation *schedOp) +{ + // TODO: MLBEDSW-8868 Restore fused transpose/reverse + if ( auto ofm = schedOp->TryOFM(); ofm && (ofm->transpose != TransposeType::None || ofm->reverse != ReverseType::None) ) + return false; + if ( schedOp->Type() == OpType::Conv2D ) return true; + if ( schedOp->Type() == OpType::DepthwiseConv2DBias ) return true; + return false; +} + +using DecomposeFunc = std::vector> (*)(Architecture *, std::unique_ptr); + +// Decompose to sub-operations with size 1 along the leading axes. +// Used for the batch dimension, and for the leading N-3 dimensions for elementwise operations. 
+static std::vector> DecomposeLeadingDimensions( + int dimensions, Architecture *arch, std::unique_ptr op, DecomposeFunc doDecompose) +{ + std::vector> result; + int axis = --dimensions; + auto *ofmConn = op->Output(TensorUsage::OFM); + auto *ifmConn = op->Input(TensorUsage::IFM0); + auto *ifm2Conn = op->TryInput(TensorUsage::IFM1); + auto newIfmSlice = ifmConn->slice; + auto newOfmSlice = ofmConn->slice; + newIfmSlice.shape[axis] = 1; + newOfmSlice.shape[axis] = 1; + TensorSlice newIfm2Slice; + if ( ifm2Conn != nullptr ) + { + newIfm2Slice = ifm2Conn->slice; + newIfm2Slice.shape[axis] = 1; + } + auto dimSize = ofmConn->shape[axis]; + for ( int i = 0; i < dimSize; i++ ) + { + std::unique_ptr subOp = MakeSubOperation(op.get()); + newIfmSlice.offset[axis] = i; + newOfmSlice.offset[axis] = i; + if ( ifm2Conn != nullptr ) + { + newIfm2Slice.offset[axis] = i; + } + subOp->Input(TensorUsage::IFM)->slice = newIfmSlice; + subOp->Output(TensorUsage::OFM)->slice = newOfmSlice; + if ( ifm2Conn != nullptr ) + { + subOp->Input(TensorUsage::IFM1)->slice = newIfm2Slice; + } + auto subOps = (dimensions > 0) ? 
DecomposeLeadingDimensions(dimensions, arch, std::move(subOp), doDecompose) : doDecompose(arch, std::move(subOp)); + result.insert(result.end(), std::make_move_iterator(subOps.begin()), std::make_move_iterator(subOps.end())); + } + return result; +} + +// Handle dilation by decomposing to suboperations with input stride = dilation and dilation 1 +static std::vector> +HandleDilation(Architecture *arch, std::unique_ptr op, DecomposeFunc doDecompose) +{ + std::vector> result; + auto *ofmConn = op->Output(TensorUsage::OFM); + auto *ifmConn = op->Input(TensorUsage::IFM); + auto *kernel = op->Kernel(); + auto &dilation = kernel->Dilation(); + auto &stride = kernel->Stride(); + auto GY = std::gcd(dilation.y, stride.y); + auto GX = std::gcd(dilation.x, stride.x); + auto DY = dilation.y / GY; + auto DX = dilation.x / GX; + for ( auto dy = 0; dy < DY; ++dy ) + { + for ( auto dx = 0; dx < DX; ++dx ) + { + auto newIfmSlice = ifmConn->slice; + auto newOfmSlice = ofmConn->slice; + auto ifmStrides = ifmConn->stepXY; + auto ofmStrides = ofmConn->stepXY; + newIfmSlice.offset[1] += dy * GY; + newIfmSlice.offset[2] += dx * GX; + ifmStrides.y *= DY * GY; + ifmStrides.x *= DX * GX; + newOfmSlice.offset[1] += dy; + newOfmSlice.offset[2] += dx; + newOfmSlice.shape[1] -= dy; + newOfmSlice.shape[2] -= dx; + ofmStrides.y *= DY; + ofmStrides.x *= DX; + auto newKernel = kernel->WithDilation({1, 1}).WithStride(stride / Point2i{GX, GY}); + std::unique_ptr subOp = MakeSubOperation(op.get(), &newKernel); + auto *subIfmConn = subOp->Input(TensorUsage::IFM); + subIfmConn->slice = std::move(newIfmSlice); + subIfmConn->stepXY = ifmStrides; + auto *subOfmConn = subOp->Output(TensorUsage::OFM); + subOfmConn->slice = std::move(newOfmSlice); + subOfmConn->stepXY = ofmStrides; + auto subOps = doDecompose(arch, std::move(subOp)); + result.insert(result.end(), std::make_move_iterator(subOps.begin()), std::make_move_iterator(subOps.end())); + } + } + return result; +} + +// Negative ifm offsets indicate new 
padding values with ifm offset 0 +static void UpdatePaddingIfOffsetNegative(SchedulerOperation *op) +{ + auto &ifmSlice = op->Input(TensorUsage::IFM)->slice; + if ( ifmSlice.offset.Height() < 0 || ifmSlice.offset.Width() < 0 ) + { + auto *kernel = op->Kernel(); + auto &padding = kernel->Padding(); + auto topPad = std::max(0, -ifmSlice.offset.Height()); + auto leftPad = std::max(0, -ifmSlice.offset.Width()); + auto newPadding = Margin(topPad, leftPad, padding.Bottom(), padding.Right()); + ifmSlice.offset[1] = std::max(0, ifmSlice.offset.Height()); + ifmSlice.offset[2] = std::max(0, ifmSlice.offset.Width()); + auto newKernel = kernel->WithPadding(newPadding); + op->SetKernel(&newKernel); + } +} + +std::vector> DecomposeConv2D(Architecture *arch, std::unique_ptr op) +{ + std::vector> result; + auto *ofmConn = op->Output(TensorUsage::OFM); + auto *ifmConn = op->Input(TensorUsage::IFM); + const auto &ofmShape = ofmConn->shape; + const auto &ifmShape = ifmConn->shape; + auto &ofmSlice = ofmConn->slice; + auto &ifmSlice = ifmConn->slice; + auto *kernel = op->Kernel(); + auto &padding = kernel->Padding(); + if ( !ofmSlice.offset.IsValid() ) + { + ofmSlice.offset = ofmShape.WithZeros(); + ifmSlice.offset = ifmShape.WithZeros().WithHW(-padding.Top(), -padding.Left()); + } + if ( ofmShape.Batch() > 1 ) + { + return DecomposeLeadingDimensions(1, arch, std::move(op), DecomposeConv2D); + } + if ( CanRunOnHardware(arch, op.get()) ) + { + UpdatePaddingIfOffsetNegative(op.get()); + result.emplace_back(std::move(op)); + return result; + } + auto &dilation = kernel->Dilation(); + if ( dilation.x > 1 || dilation.y > 1 ) + { + return HandleDilation(arch, std::move(op), DecomposeConv2D); + } + // TODO: MLBEDSW-8783 Decompose convolutions with large stride + // If we get here, decomposition has failed, the resulting operations will be executed on CPU + result.emplace_back(std::move(op)); + return result; +} + +std::vector> DecomposeDepthwiseConv2D(Architecture *arch, std::unique_ptr op) 
+{ + std::vector> result; + auto *ofmConn = op->Output(TensorUsage::OFM); + auto *ifmConn = op->Input(TensorUsage::IFM); + auto *weightsConn = op->Input(TensorUsage::Weights); + const auto &ofmShape = ofmConn->shape; + const auto &ifmShape = ifmConn->shape; + const auto &weightsShape = weightsConn->shape; + auto &ofmSlice = ofmConn->slice; + auto &ifmSlice = ifmConn->slice; + auto *kernel = op->Kernel(); + auto &padding = kernel->Padding(); + if ( !ofmSlice.offset.IsValid() ) + { + ofmSlice.offset = ofmShape.WithZeros(); + ifmSlice.offset = ifmShape.WithZeros().WithHW(-padding.Top(), -padding.Left()); + } + if ( ofmShape.Batch() > 1 ) + { + return DecomposeLeadingDimensions(1, arch, std::move(op), DecomposeDepthwiseConv2D); + } + if ( weightsShape.Depth() > 1 ) + { + // TODO: MLBEDSW-8789 Handle depthwise convolution with depth multiplier > 1 + // If we get here, decomposition has failed, the resulting operations will be executed on CPU + result.emplace_back(std::move(op)); + return result; + } + if ( CanRunOnHardware(arch, op.get()) ) + { + UpdatePaddingIfOffsetNegative(op.get()); + result.emplace_back(std::move(op)); + return result; + } + auto &dilation = kernel->Dilation(); + if ( dilation.x > 1 || dilation.y > 1 ) + { + return HandleDilation(arch, std::move(op), DecomposeDepthwiseConv2D); + } + // TODO: MLBEDSW-8783 Decompose convolutions with large stride + // If we get here, decomposition has failed, the resulting operations will be executed on CPU + result.emplace_back(std::move(op)); + return result; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/scheduler_decompose.hpp b/ethosu/regor/compiler/scheduler_decompose.hpp new file mode 100644 index 00000000..970c05f7 --- /dev/null +++ b/ethosu/regor/compiler/scheduler_decompose.hpp @@ -0,0 +1,36 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you 
may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "graph.hpp" +#include "operation.hpp" +#include "scheduler_operation.hpp" + +#include + +namespace regor +{ +bool NeedsDecompose(Architecture *arch, const SchedulerOperation *schedOp); +bool CanRunOnHardware(Architecture *arch, const SchedulerOperation *schedOp); +bool CanDecompose(Architecture *arch, const SchedulerOperation *schedOp); +std::vector> DecomposeConv2D(Architecture *arch, std::unique_ptr op); +std::vector> DecomposeDepthwiseConv2D(Architecture *arch, std::unique_ptr op); +std::vector> DecomposeElementwise(Architecture *arch, std::unique_ptr op); + +} // namespace regor diff --git a/ethosu/regor/compiler/scheduler_operation.hpp b/ethosu/regor/compiler/scheduler_operation.hpp new file mode 100644 index 00000000..3e663905 --- /dev/null +++ b/ethosu/regor/compiler/scheduler_operation.hpp @@ -0,0 +1,305 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "common/ordered_map.hpp" +#include "kernel.hpp" +#include "operation.hpp" +#include "tensor.hpp" + +#include + +namespace regor +{ + +class SchedulerOperation; + +int TensorAllocationBytes(const Shape &shape, TensorFormat format, DataType dtype); + +/// +/// Scheduler's metadata for graph tensors. +/// +struct SchedulerTensor +{ +public: + std::shared_ptr srcTensor; + TensorFormat format = TensorFormat::Unknown; + MemArea memArea; + Shape storageShape; + BufferView bufferView; + DataType dataType; + bool hasCPUReaders = false; + bool hasCPUWriters = false; + bool isGraphInput = false; + bool isGraphOutput = false; + int allocatedSize = -1; + Address allocatedAddress = -1; + bool needsLinearFormat = false; + // If two tensors have same equivalence id and same memory area, they can be stored on the same address + UniqueId equivalenceId = GenerateUniqueId(); + UniqueId uid = ~0u; // Packing must initialise + std::vector producers; + std::vector consumers; + + int AllocationSizeBytes() const + { + return (allocatedSize > 0) ? allocatedSize : TensorAllocationBytes(storageShape, format, dataType); + } + std::string Name() const { return srcTensor.get() == nullptr ? "?" : srcTensor->Name(); } + bool IsConstant() const { return bufferView.HasBuffer() && bufferView.BufferSize() > 0; } +}; + + +enum class Buffering +{ + None, + Single, + Double, +}; + + +/// +/// Scheduler's metadata for tensor connections. 
This data is not shared +/// between operators working on the same tensor +/// +struct SchedulerConnection +{ + std::shared_ptr tensor; + Shape shape; + TensorSlice slice; + Point2i stepXY{1, 1}; + Quantization quantization; + ArchResampling resamplingMode = ArchResampling::None; + TransposeType transpose = TransposeType::None; + ReverseType reverse = ReverseType::None; + bool requireFullTensor = false; + bool preBuffer = false; + Buffering buffering = Buffering::None; + + int PartialAllocationSizeBytes() const { return TensorAllocationBytes(shape, tensor->format, tensor->dataType); } + const Shape &SliceShape() const { return slice.shape.IsEmpty() ? shape : slice.shape; } +}; + +enum class AccumulatorSource +{ + Reset = 0, + Acc = 1, + Ifm2 = 2 +}; + +struct AccumulatorControl +{ + AccumulatorSource source = AccumulatorSource::Reset; + bool outputEnabled = true; +}; + +/// +/// Scheduler's representation of executable operations +/// +class SchedulerOperation +{ + friend class SchedulerPacking; + friend class Scheduler; + +public: + OpType _type; + int _index = -1; // Execution index + std::unique_ptr _kernel; + bool _npuOp = false; + bool _hasScaling = false; + void *_srcKey = nullptr; + int _primaryIfmIndex = 0; + OpTypeParameters _parameters; + Operation::Attributes _attributes; + RoundMode _rounding = RoundMode::DBL; + AccumulatorControl _accumulatorControl; + DynamicRef _attr; + const class SchedulerOperation *_parent = nullptr; + std::vector> _subOps; // (activations, or Ethos-U85 chained ops) + ordered_map inputs; + ordered_map outputs; + std::unique_ptr _opGroup; + int _opGroupKey = 0; + +private: + UniqueId _uid; + +public: + SchedulerOperation(OpType opType) : _type(opType), _parameters({}) { _uid = GenerateUniqueId(); } + ~SchedulerOperation() { Disconnect(); } + +public: + OpType Type() const { return _type; } + int Index() const { return _index; } + + UniqueId Uid() const { return _uid; } + operator UniqueId() const { return _uid; } + + bool IsNpuOp() 
const { return _npuOp; } + void SetNpuOp(bool npuOp) { _npuOp = npuOp; } + + class Kernel *Kernel() const { return _kernel.get(); } + void SetKernel(const class Kernel *kernel) { _kernel = std::make_unique(*kernel); } + + bool HasScaling() const { return _hasScaling; } + void SetHasScaling(bool hasScaling) { _hasScaling = hasScaling; } + + RoundMode Rounding() const { return _rounding; } + void SetRounding(RoundMode rounding) { _rounding = rounding; } + + const AccumulatorControl &AccumulatorMode() const { return _accumulatorControl; } + void SetAccumulatorMode(const AccumulatorControl &accumulatorControl) { _accumulatorControl = accumulatorControl; } + + const class SchedulerOperation *Parent() const { return _parent; } + void SetParent(const class SchedulerOperation *parent) { _parent = parent; } + + int PrimaryIfmIndex() const { return _primaryIfmIndex; } + void SetPrimaryIfmIndex(int index) { _primaryIfmIndex = index; } + + const OpTypeParameters &Parameters() const { return _parameters; } + OpTypeParameters &Parameters() { return _parameters; } + void SetParameters(OpTypeParameters parameters) { _parameters = parameters; } + + const Operation::Attributes &Attributes() const { return _attributes; } + Operation::Attributes &Attributes() { return _attributes; } + void SetAttributes(Operation::Attributes attributes) { _attributes = attributes; } + + template + TYPE *Attribute() + { + if ( _attr && _attr.Info()->Hash() == TypeHash::HASH ) + { + return static_cast(_attr.Instance()); + } + return nullptr; + } + + // Input connections + SchedulerConnection *AddInput(TensorUsage usage) { return &inputs[usage]; } + + const SchedulerConnection *TryInput(TensorUsage usage) const { return inputs.try_ref(usage); } + SchedulerConnection *TryInput(TensorUsage usage) { return inputs.try_ref(usage); } + SchedulerConnection *Input(TensorUsage usage) { return &inputs.at(usage); } + const SchedulerConnection *Input(TensorUsage usage) const { return &inputs.at(usage); } + + 
SchedulerConnection *TryIFM(int index) { return inputs.try_ref(MakeTensorUsage(TensorUsage::IFM, index)); } + const SchedulerConnection *TryIFM(int index) const + { + return inputs.try_ref(MakeTensorUsage(TensorUsage::IFM, index)); + } + SchedulerConnection *IFM(int index) { return &inputs.at(MakeTensorUsage(TensorUsage::IFM, index)); } + const SchedulerConnection *IFM(int index) const { return &inputs.at(MakeTensorUsage(TensorUsage::IFM, index)); } + + // Output connections + SchedulerConnection *AddOutput(TensorUsage usage) { return &outputs[usage]; } + + SchedulerConnection *TryOutput(TensorUsage usage) { return outputs.try_ref(usage); } + SchedulerConnection *Output(TensorUsage usage) { return &outputs.at(usage); } + const SchedulerConnection *Output(TensorUsage usage) const { return &outputs.at(usage); } + + SchedulerConnection *TryOFM() { return outputs.try_ref(TensorUsage::OFM); } + const SchedulerConnection *TryOFM() const { return outputs.try_ref(TensorUsage::OFM); } + SchedulerConnection *OFM() { return &outputs.at(TensorUsage::OFM); } + const SchedulerConnection *OFM() const { return &outputs.at(TensorUsage::OFM); } + + void AddSubOp(std::unique_ptr subOp) { _subOps.push_back(std::move(subOp)); } + + const std::vector> &SubOps() const { return _subOps; } + + // Returns connections for which live range calculation is needed + std::vector> LiveRangeTensors() const + { + std::vector> liveTensors; + for ( const auto *list : {&inputs, &outputs} ) + { + for ( const auto &item : list->pairs() ) + { + auto usage = item.first & TensorUsage::TypeMask; + if ( usage == TensorUsage::IFM || usage == TensorUsage::OFM || usage == TensorUsage::LUT ) + { + liveTensors.push_back(std::make_pair(item.first, item.second.tensor.get())); + } + } + } + // TODO: intermediates + return liveTensors; + } + + void SetOpGroup(std::unique_ptr &&opGroup) { _opGroup = std::move(opGroup); } + + void SetOpGroupKey(int opGroupKey) { _opGroupKey = opGroupKey; } + + bool IsReordering() const 
+ { + if ( !IsNone(OFM()->transpose) ) + { + return true; + } + + if ( OFM()->reverse != ReverseType::None ) + { + return true; + } + + for ( const auto &op : _subOps ) + { + if ( op->IsReordering() ) + { + return true; + } + } + + return false; + } + + void Disconnect() + { + for ( const auto *list : {&inputs, &outputs} ) + { + for ( const auto &item : list->pairs() ) + { + auto usage = item.first; + const auto &connection = item.second; + auto &vec = IsOFM(usage) ? connection.tensor->producers : connection.tensor->consumers; + vec.erase(std::remove(vec.begin(), vec.end(), this), vec.end()); + } + } + inputs.clear(); + outputs.clear(); + } + + bool IsDisconnected() const { return inputs.empty() && outputs.empty(); } +}; + +/// +/// NPU-Operation +/// +class NPUOperation +{ +private: + std::vector> _ops; + +public: + const std::vector> &Operations() const { return _ops; }; + void AddOperation(std::unique_ptr op) { _ops.push_back(std::move(op)); } +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/scheduler_packing.cpp b/ethosu/regor/compiler/scheduler_packing.cpp new file mode 100644 index 00000000..44fba907 --- /dev/null +++ b/ethosu/regor/compiler/scheduler_packing.cpp @@ -0,0 +1,493 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "scheduler_packing.hpp" + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "common/shape.hpp" +#include "graph.hpp" +#include "operation.hpp" +#include "scheduler_decompose.hpp" +#include "scheduler_operation.hpp" +#include "tensor.hpp" + +#include + +namespace regor +{ + +namespace +{ + +// Returns true if no IFMs, or if all IFMs are graph inputs +bool AllInputsAreGraphInputs(const SchedulerOperation &op) +{ + for ( const auto &schedConn : op.inputs ) + { + if ( !schedConn.tensor->IsConstant() && !schedConn.tensor->isGraphInput ) return false; + } + return true; +} + +// Returns true if no OFMs, or if all OFMs are graph outputs +bool AllOutputsAreGraphOutputs(const SchedulerOperation &op) +{ + for ( const auto &outputSchedConn : op.outputs ) + { + if ( !outputSchedConn.tensor->isGraphOutput ) return false; + } + return true; +} + +// Returns true if any of first's OFMs are same as second's IFMs +bool IsConnected(const SchedulerOperation &first, const SchedulerOperation &second) +{ + for ( const auto &firstOutputSchedConn : first.outputs ) + { + for ( const auto &secondInputSchedConn : second.inputs ) + { + if ( firstOutputSchedConn.tensor == secondInputSchedConn.tensor ) return true; + } + } + return false; +} + +} // namespace + +SchedulerPacking::SchedulerPacking(Architecture *arch) : _arch(arch) +{ +} + +std::vector> SchedulerPacking::Process(const Graph *graph) +{ + // Get operation list in execution order + std::vector executionList; + Graph::TraverseGraphFromEnd(graph->Outputs(), + [&](Operation *op) -> bool + { + executionList.push_back(op); + return true; + }); + + FilterOperations(executionList, graph); + + PackOperations(); + + ReorderOperations(); + + return std::move(_schedList); +} + +void SchedulerPacking::FilterOperations(const std::vector &executionList, const Graph *graph) +{ + // Convert linear Graph Operations to a list of Scheduler Operations + for ( Operation *op : executionList ) + { + auto schedOp = 
MakeSchedulerOperation(op, graph); + if ( NeedsDecompose(_arch, schedOp.get()) ) + { + auto schedOps = DecomposeSchedulerOperation(std::move(schedOp)); + _schedList.insert( + _schedList.end(), std::make_move_iterator(schedOps.begin()), std::make_move_iterator(schedOps.end())); + } + else + { + _schedList.push_back(std::move(schedOp)); + } + } +} + +void SchedulerPacking::SchedulerPacking::PackOperations() +{ + LOG_TRACE1("Scheduler Packing (of {0} Ops)\n", _schedList.size()); + + auto cur = _schedList.begin(); + auto write = cur; + + while ( cur != _schedList.end() ) + { + SchedulerOperation *primaryOp = cur->get(); + + // Compact the list as we go + if ( std::distance(write, cur) >= 1 ) + { + *write = std::move(*cur); + } + primaryOp->_index = int(std::distance(_schedList.begin(), write)); + + cur++; + + LOG_TRACE1("Creating new group with {}\n", OpTypeToString(primaryOp->Type())); + + ArchitectureOpGroupQuery op0{}; + op0.type = primaryOp->Type(); + op0.kernel = primaryOp->Kernel(); + op0.ifm.key = primaryOp->IFM(0)->tensor->uid; + op0.ifm.type = primaryOp->IFM(0)->tensor->dataType; + if ( primaryOp->TryIFM(1) ) + { + op0.ifm2.key = primaryOp->IFM(1)->tensor->uid; + op0.ifm2.type = primaryOp->IFM(1)->tensor->dataType; + } + op0.ofm.key = primaryOp->OFM()->tensor->uid; + op0.ofm.type = primaryOp->OFM()->tensor->dataType; + + // Try to create OpGroup + auto group = _arch->CreateOpGroup(op0); + + // OpGroup is nullptr if op can't run on NPU + if ( group ) + { + primaryOp->SetNpuOp(true); + + // First op in group has key 0 + int prevOpKey = 0; + primaryOp->SetOpGroupKey(prevOpKey); + LOG_TRACE1("Created new group with {} (key {})\n", OpTypeToString(primaryOp->Type()), prevOpKey); + + // Root SchedulerOperation takes ownership of the ArchitectureOpGroup here + primaryOp->SetOpGroup(std::move(group)); + + // Pack any future ops that will fit + auto prevOp = primaryOp; + + // Try chaining subsequent ops into the primary + while ( cur != _schedList.end() ) + { + 
SchedulerOperation *nextOp = cur->get(); + assert(nextOp); // Empty op may be possible if we seek ahead + + int key = CanPack(primaryOp, prevOp, nextOp, prevOpKey); + if ( !key ) + { + LOG_TRACE1("Can't add next op\n"); + break; + } + nextOp->SetNpuOp(true); + nextOp->SetParent(primaryOp); + nextOp->SetOpGroupKey(key); + + LOG_TRACE1("Added {} (key {}) to {} (key {})\n", OpTypeToString(nextOp->Type()), key, + OpTypeToString(prevOp->Type()), prevOpKey); + + // Replace primary op's OFM by nextOp's OFM + auto *ofmConn = primaryOp->OFM(); + ofmConn->tensor = nextOp->OFM()->tensor; + if ( IsActivation(nextOp->Type()) ) + { + ofmConn->quantization = prevOp->Output(TensorUsage::OFM)->quantization; + ofmConn->quantization.quantMin = nextOp->Output(TensorUsage::OFM)->quantization.quantMin; + ofmConn->quantization.quantMax = nextOp->Output(TensorUsage::OFM)->quantization.quantMax; + } + // Add nextOp's LUT to primary Op + auto lutConn = nextOp->TryInput(TensorUsage::LUT); + if ( lutConn != nullptr ) + { + primaryOp->AddInput(TensorUsage::LUT)->tensor = lutConn->tensor; + } + + prevOpKey = key; + prevOp = nextOp; + primaryOp->AddSubOp(std::move(*cur)); + cur++; + } + + LOG_TRACE1("\t{0}: {1} - OFM [{2}] <- (IFM0 [{3}], IFM1 [{4}], Primary={5})\n", primaryOp->Index(), + OpTypeToString(primaryOp->Type()), primaryOp->OFM()->shape.ToString(), primaryOp->IFM(0)->shape.ToString(), + primaryOp->IFM(1) ? primaryOp->IFM(1)->shape.ToString() : "", primaryOp->PrimaryIfmIndex()); + } + write++; + } + + // Shorten list to contain only those operators written + _schedList.erase(write, cur); +} + +// Reorder CPU ops so that there are fewer groups of consecutive CPU ops in the list of ops +void SchedulerPacking::ReorderOperations() +{ + // Graphs with both CPU and NPU ops might not have an optimal order in the ops list due to how the graph is + // traversed (depth first search). This can result in more context switching between CPU and NPU. 
Try to optimise + // this by moving/grouping CPU ops where that is possible. Criteria for CPU pass to be moved: + // + // 1) CPU passes that only consumes graph input tensors can be moved to the top of the list. + // + // 2) CPU passes that only produces graph output tensors can be moved to the bottom of the list. + // + // 3) A CPU pass X is allowed to be grouped together with CPU pass Y if there is no NPU pass between pass X and pass + // Y that depends on output from pass X. Criteria 3 will try to move as many CPU passes towards the bottom of the + // list. + + // Ops with only graph input IFMs + std::vector> earlyOps; + + // Ops with only graph output OFMs + std::vector> lateOps; + + // Ops not in the above two lists + std::vector> otherOps; + + // Reserving space since most ops are likely to end up here + otherOps.reserve(_schedList.size()); + + // Iterate in execution order to find CPU ops with only graph input IFMs or only graph output OFMs + for ( auto i = _schedList.begin(); i != _schedList.end(); ++i ) + { + std::unique_ptr &op = *i; + + if ( !op->IsNpuOp() && AllInputsAreGraphInputs(*op) ) + { + earlyOps.push_back(std::move(*i)); + } + else if ( !op->IsNpuOp() && AllOutputsAreGraphOutputs(*op) ) + { + lateOps.push_back(std::move(*i)); + } + else + { + otherOps.push_back(std::move(*i)); + } + } + + // Iterate in reverse execution order to find CPU ops + for ( auto i = otherOps.rbegin(); i != otherOps.rend(); ++i ) + { + std::unique_ptr &op = *i; + + // We're looking for CPU ops + if ( op->IsNpuOp() ) continue; + + // Iterate in execution order from the CPU op's position + for ( auto j = i; j != otherOps.rbegin(); --j ) + { + std::unique_ptr &op0 = *(j - 0); // Earlier in execution order + std::unique_ptr &op1 = *(j - 1); // Later in execution order + assert(!op0->IsNpuOp()); + + // Don't move past another CPU op + if ( !op1->IsNpuOp() ) break; + + // If our CPU op and the op after are connected, we can't move it down + if ( IsConnected(*op0, *op1) ) 
break; + + // Move our CPU op one step later in execution order + std::iter_swap(j, j - 1); + } + } + + // Reassemble the list + _schedList.clear(); + _schedList.reserve(earlyOps.size() + otherOps.size() + lateOps.size()); + _schedList.insert(_schedList.end(), std::make_move_iterator(earlyOps.begin()), std::make_move_iterator(earlyOps.end())); + _schedList.insert(_schedList.end(), std::make_move_iterator(otherOps.begin()), std::make_move_iterator(otherOps.end())); + _schedList.insert(_schedList.end(), std::make_move_iterator(lateOps.begin()), std::make_move_iterator(lateOps.end())); + + // Recalculate the op index now when the list may have a different order + for ( auto i = _schedList.begin(); i != _schedList.end(); ++i ) + { + (*i)->_index = int(std::distance(_schedList.begin(), i)); + } +} + +int SchedulerPacking::CanPack(const SchedulerOperation *schedOp, const SchedulerOperation *prevOp, + const SchedulerOperation *nextOp, const int prevOpKey) const +{ + const auto prevConnOfm = prevOp->OFM(); + const auto nextConnIfm = nextOp->IFM(0); + const auto nextConnIfm2 = nextOp->TryIFM(1); + const auto nextConnOfm = nextOp->OFM(); + + SchedulerTensor *prevOFM = prevConnOfm->tensor.get(); + SchedulerTensor *ifmTensor = nextConnIfm->tensor.get(); + SchedulerTensor *ifm2Tensor = nextConnIfm2 ? nextConnIfm2->tensor.get() : nullptr; + assert(prevOFM && "primary/prev op must have OFM"); + assert(ifmTensor && "next op must have IFM"); + + // Previous op in execution order doesn't connect to this one + if ( prevOFM != ifmTensor && prevOFM != ifm2Tensor ) + { + return 0; + } + + // Highly unlikely constant tensor between ops + assert(!prevOFM->srcTensor->IsConstant() && "Unexpected constant tensor between ops"); + + // Only pack tensors on single-reader/writer paths (i.e. 
can't pack across concat/split) + if ( prevOFM->producers.size() != 1 || prevOFM->consumers.size() != 1 ) + { + return 0; + } + + if ( schedOp->OFM()->tensor->isGraphOutput ) + { + return 0; + } + + if ( IsActivation(nextOp->Type()) && !nextConnIfm->quantization.EqualScales(nextConnOfm->quantization) ) + { + // Can not fuse activation with different scales + return 0; + } + + ArchitectureOpGroupQuery op1{}; + op1.type = nextOp->Type(); + op1.kernel = nextOp->Kernel(); + op1.ifm.key = nextConnIfm->tensor->uid; + op1.ifm.type = nextConnIfm->tensor->dataType; + if ( nextConnIfm2 ) + { + op1.ifm2.key = nextConnIfm2->tensor->uid; + op1.ifm2.type = nextConnIfm2->tensor->dataType; + } + op1.ofm.key = nextConnOfm->tensor->uid; + op1.ofm.type = nextConnOfm->tensor->dataType; + + return schedOp->_opGroup->Add(op1, {prevOpKey}); +} + +void SchedulerPacking::InitSchedulerConnection( + SchedulerConnection *schedConn, const std::shared_ptr &tensor, const TensorConnection &conn) +{ + schedConn->tensor = tensor; + schedConn->slice = conn.slice; + schedConn->shape = Shape::PadAxes(conn.shape, 3, 1); // Scheduler needs minimum HWC axes to stripe + schedConn->quantization = conn.quantization; + schedConn->transpose = conn.transpose; + schedConn->reverse = conn.reverse; +} + +void SchedulerPacking::InitSchedulerTensor(SchedulerTensor *schedTensor, Tensor *tensor, const Graph *graph) +{ + // Take scheduler-local copies of graph tensor parameters. + schedTensor->format = TensorFormat::NHWC; + schedTensor->memArea = tensor->IsConstant() ? 
_arch->ReadonlyMemory() : _arch->FeatureMapMemory(); + schedTensor->storageShape = Shape::PadAxes(tensor->StorageShape(), 4, 1); + schedTensor->dataType = tensor->Type(); + schedTensor->bufferView = tensor->View(); + schedTensor->isGraphInput = graph->IsInput(tensor); + schedTensor->isGraphOutput = graph->IsOutput(tensor); + schedTensor->uid = tensor->Uid(); +} + +std::unique_ptr SchedulerPacking::MakeSchedulerOperation(Operation *op, const Graph *graph) +{ + assert(op->Type() != OpType::None); + + std::unique_ptr schedOp = std::make_unique(op->Type()); + + schedOp->SetKernel(op->Kernel()); + schedOp->SetHasScaling(op->HasScaling()); + schedOp->SetRounding(op->Rounding()); + schedOp->SetParameters(op->Parameters()); + schedOp->SetAttributes(op->attr); + schedOp->_attr = op->AttributeRef(); + schedOp->_srcKey = op; + + // Get the inputs from the source op and connect with scheduler specific tensor + for ( const auto *list : {&op->Inputs(), &op->Outputs()} ) + { + for ( const auto &item : list->pairs() ) + { + Tensor *tensor = item.second.tensor.get(); + + // Get/update scheduler's metadata for the graph tensor. + auto pos = _tensorMap.find(tensor); + if ( pos == _tensorMap.end() ) + { + // Create new scheduler tensor if metadata is missing. + auto tmp = std::make_shared(); + pos = _tensorMap.emplace(tensor, tmp).first; + tmp->srcTensor = item.second.tensor; + InitSchedulerTensor(tmp.get(), tensor, graph); + } + + // Update consumers and manage connectivity + const std::shared_ptr &schedTensor = pos->second; + + if ( IsOFM(item.first) ) + { + schedTensor->producers.push_back(schedOp.get()); + } + else + { + schedTensor->consumers.push_back(schedOp.get()); + } + SchedulerConnection *schedConn = IsOFM(item.first) ? 
schedOp->AddOutput(item.first) : schedOp->AddInput(item.first); + InitSchedulerConnection(schedConn, schedTensor, item.second); + schedConn->resamplingMode = ResamplingMode(item.first, schedOp->Type()); + } + } + + // Examine elementwise and set a primary path for cascading. + if ( IsBinaryElementwise(op->Type()) ) + { + auto ifm0 = op->Input(TensorUsage::IFM0); + auto ifm1 = op->Input(TensorUsage::IFM1); + auto ofm = op->Output(TensorUsage::OFM); + assert(ifm0->shape.Size() > 0 && "IFM0 must have dimension"); + assert(ifm1->shape.Size() > 0 && "IFM1 must have dimension"); + // Choose the non-const IFM path for binary operations that have + // a constant input on the first IFM + if ( ifm0->tensor->IsConstant() && !ifm1->tensor->IsConstant() ) + { + schedOp->SetPrimaryIfmIndex(1); + } + // Favour the non-broadcast shape for cascading. + else if ( (ifm0->shape != ofm->shape) && (ifm1->shape == ofm->shape) ) + { + schedOp->SetPrimaryIfmIndex(1); + } + } + + return schedOp; +} + +std::vector> SchedulerPacking::DecomposeSchedulerOperation(std::unique_ptr op) +{ + std::vector> result; + switch ( op->Type() ) + { + case OpType::Conv2D: + result = DecomposeConv2D(_arch, std::move(op)); + break; + case OpType::DepthwiseConv2DBias: + result = DecomposeDepthwiseConv2D(_arch, std::move(op)); + break; + default: + assert(false); + break; + } + return result; +} + +ArchResampling SchedulerPacking::ResamplingMode(TensorUsage usage, OpType opType) const +{ + // only set resampling-mode on IFM-connections + if ( IsIFM(usage) ) + { + if ( opType == OpType::Conv2DBackpropInputSwitchedBias ) + { + return ArchResampling::Zeros; + } + } + return ArchResampling::None; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/scheduler_packing.hpp b/ethosu/regor/compiler/scheduler_packing.hpp new file mode 100644 index 00000000..84123e4c --- /dev/null +++ b/ethosu/regor/compiler/scheduler_packing.hpp @@ -0,0 +1,70 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited 
and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "common/shape.hpp" +#include "graph.hpp" +#include "operation.hpp" +#include "scheduler_operation.hpp" +#include "tensor.hpp" + +#include +#include +#include +#include +#include +#include + +namespace regor +{ + +/// +/// Pack graph-level operations into flat/linear SchedulerOperation objects +/// (NOTE: Was pass_packing, but we skip the Pass to pack directly into SchedulerOperation) +/// +class SchedulerPacking +{ +protected: + Architecture *_arch = nullptr; + std::vector> _schedList; + std::unordered_map> _tensorMap; + +public: + SchedulerPacking(Architecture *arch); + +public: + std::vector> Process(const Graph *graph); + +private: + void FilterOperations(const std::vector &executionList, const Graph *graph); + void PackOperations(); + void ReorderOperations(); + + int CanPack(const SchedulerOperation *schedOp, const SchedulerOperation *prevOp, const SchedulerOperation *op, const int prevOpKey) const; + void InitSchedulerConnection(SchedulerConnection *schedConn, const std::shared_ptr &tensor, const TensorConnection &conn); + void InitSchedulerTensor(SchedulerTensor *schedTensor, Tensor *tensor, const Graph *graph); + std::unique_ptr MakeSchedulerOperation(Operation *op, const Graph *graph); + std::vector> DecomposeSchedulerOperation(std::unique_ptr op); + 
ArchResampling ResamplingMode(TensorUsage usage, OpType opType) const; +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/softmax.cpp b/ethosu/regor/compiler/softmax.cpp new file mode 100644 index 00000000..313ff9d9 --- /dev/null +++ b/ethosu/regor/compiler/softmax.cpp @@ -0,0 +1,530 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "compiler/softmax.hpp" + +#include "common/numeric_util.hpp" +#include "common/scaling.hpp" +#include "operation.hpp" +#include "operation_util.hpp" + +#include +#include +#include + +namespace regor +{ + +/*** Exp LUT table for int16 Softmax */ +static const uint32_t EXP_LUT[] = { + // clang-format off + 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, + 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, + 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, 0x00000002, + 0x00000002, 0x00000002, 0x00010002, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, + 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, 0x00000003, + 0x00000003, 0x00000003, 0x00000003, 0x00010003, 0x00000004, 0x00000004, 0x00000004, 0x00000004, + 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, 0x00000004, + 
0x00010004, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005, 0x00000005, + 0x00000005, 0x00000005, 0x00010005, 0x00000006, 0x00000006, 0x00000006, 0x00000006, 0x00000006, + 0x00000006, 0x00000006, 0x00010006, 0x00000007, 0x00000007, 0x00000007, 0x00000007, 0x00000007, + 0x00000007, 0x00000007, 0x00010007, 0x00000008, 0x00000008, 0x00000008, 0x00000008, 0x00000008, + 0x00010008, 0x00000009, 0x00000009, 0x00000009, 0x00000009, 0x00000009, 0x00010009, 0x0000000a, + 0x0000000a, 0x0000000a, 0x0000000a, 0x0001000a, 0x0000000b, 0x0000000b, 0x0000000b, 0x0000000b, + 0x0001000b, 0x0000000c, 0x0000000c, 0x0000000c, 0x0001000c, 0x0000000d, 0x0000000d, 0x0000000d, + 0x0001000d, 0x0000000e, 0x0000000e, 0x0000000e, 0x0001000e, 0x0000000f, 0x0000000f, 0x0001000f, + 0x00000010, 0x00000010, 0x00010010, 0x00000011, 0x00000011, 0x00010011, 0x00000012, 0x00000012, + 0x00010012, 0x00000013, 0x00000013, 0x00010013, 0x00000014, 0x00010014, 0x00000015, 0x00000015, + 0x00010015, 0x00000016, 0x00010016, 0x00000017, 0x00010017, 0x00000018, 0x00010018, 0x00000019, + 0x00010019, 0x0000001a, 0x0001001a, 0x0000001b, 0x0001001b, 0x0000001c, 0x0001001c, 0x0000001d, + 0x0001001d, 0x0000001e, 0x0001001e, 0x0001001f, 0x00000020, 0x00010020, 0x00010021, 0x00000022, + 0x00010022, 0x00010023, 0x00000024, 0x00010024, 0x00000025, 0x00010025, 0x00010026, 0x00010027, + 0x00000028, 0x00020028, 0x0000002a, 0x0001002a, 0x0001002b, 0x0001002c, 0x0000002d, 0x0001002d, + 0x0001002e, 0x0001002f, 0x00010030, 0x00010031, 0x00010032, 0x00010033, 0x00010034, 0x00010035, + 0x00010036, 0x00010037, 0x00010038, 0x00020039, 0x0001003b, 0x0000003c, 0x0002003c, 0x0001003e, + 0x0002003f, 0x00000041, 0x00020041, 0x00010043, 0x00010044, 0x00020045, 0x00020047, 0x00010049, + 0x0001004a, 0x0002004b, 0x0001004d, 0x0002004e, 0x00010050, 0x00020051, 0x00020053, 0x00010055, + 0x00020056, 0x00020058, 0x0002005a, 0x0001005c, 0x0002005d, 0x0002005f, 0x00020061, 0x00020063, + 0x00020065, 0x00020067, 0x00020069, 
0x0002006b, 0x0003006d, 0x00020070, 0x00020072, 0x00020074, + 0x00030076, 0x00020079, 0x0003007b, 0x0002007e, 0x00030080, 0x00020083, 0x00020085, 0x00040087, + 0x0002008b, 0x0003008d, 0x00030090, 0x00020093, 0x00030095, 0x00030098, 0x0003009b, 0x0004009e, + 0x000300a2, 0x000300a5, 0x000300a8, 0x000300ab, 0x000400ae, 0x000300b2, 0x000400b5, 0x000400b9, + 0x000300bd, 0x000400c0, 0x000400c4, 0x000400c8, 0x000400cc, 0x000400d0, 0x000500d4, 0x000400d9, + 0x000400dd, 0x000500e1, 0x000400e6, 0x000500ea, 0x000400ef, 0x000500f3, 0x000500f8, 0x000500fd, + 0x00050102, 0x00050107, 0x0005010c, 0x00060111, 0x00050117, 0x0006011c, 0x00060122, 0x00060128, + 0x0006012e, 0x00060134, 0x0006013a, 0x00070140, 0x00060147, 0x0007014d, 0x00060154, 0x0007015a, + 0x00070161, 0x00060168, 0x0008016e, 0x00070176, 0x0008017d, 0x00080185, 0x0007018d, 0x00090194, + 0x0008019d, 0x000801a5, 0x000801ad, 0x000901b5, 0x000901be, 0x000901c7, 0x000901d0, 0x000901d9, + 0x000a01e2, 0x000901ec, 0x000a01f5, 0x000b01ff, 0x000a020a, 0x000b0214, 0x000a021f, 0x000b0229, + 0x000b0234, 0x000b023f, 0x000c024a, 0x000c0256, 0x000c0262, 0x000c026e, 0x000c027a, 0x000d0286, + 0x000d0293, 0x000d02a0, 0x000e02ad, 0x000e02bb, 0x000e02c9, 0x000e02d7, 0x000f02e5, 0x000f02f4, + 0x000f0303, 0x000f0312, 0x00100321, 0x00100331, 0x00110341, 0x00100352, 0x00120362, 0x00110374, + 0x00120385, 0x00120397, 0x001203a9, 0x001303bb, 0x001303ce, 0x001403e1, 0x001403f5, 0x00140409, + 0x0015041d, 0x00150432, 0x00160447, 0x0016045d, 0x00160473, 0x00170489, 0x001704a0, 0x001904b7, + 0x001804d0, 0x001904e8, 0x00190501, 0x001a051a, 0x001a0534, 0x001b054e, 0x001b0569, 0x001c0584, + 0x001c05a0, 0x001d05bc, 0x001e05d9, 0x001e05f7, 0x001e0615, 0x00200633, 0x00200653, 0x00200673, + 0x00210693, 0x002206b4, 0x002306d6, 0x002306f9, 0x0024071c, 0x00240740, 0x00260764, 0x0026078a, + 0x002607b0, 0x002807d6, 0x002907fe, 0x00290827, 0x002a0850, 0x002a087a, 0x002c08a4, 0x002c08d0, + 0x002e08fc, 0x002e092a, 0x002f0958, 0x00310987, 0x003109b8, 0x003209e9, 
0x00330a1b, 0x00340a4e, + 0x00350a82, 0x00350ab7, 0x00380aec, 0x00380b24, 0x003a0b5c, 0x003a0b96, 0x003c0bd0, 0x003d0c0c, + 0x003e0c49, 0x003f0c87, 0x00400cc6, 0x00420d06, 0x00430d48, 0x00440d8b, 0x00460dcf, 0x00480e15, + 0x00480e5d, 0x00490ea5, 0x004c0eee, 0x004d0f3a, 0x004e0f87, 0x00500fd5, 0x00511025, 0x00531076, + 0x005610c9, 0x0056111f, 0x00581175, 0x005a11cd, 0x005c1227, 0x005e1283, 0x005e12e1, 0x0061133f, + 0x006413a0, 0x00651404, 0x00671469, 0x006914d0, 0x006c1539, 0x006c15a5, 0x00701611, 0x00721681, + 0x007416f3, 0x00761767, 0x007917dd, 0x007a1856, 0x007d18d0, 0x0080194d, 0x008319cd, 0x00841a50, + 0x00881ad4, 0x00891b5c, 0x008d1be5, 0x00911c72, 0x00911d03, 0x00961d94, 0x00981e2a, 0x009c1ec2, + 0x009e1f5e, 0x00a21ffc, 0x00a4209e, 0x00a92142, 0x00ab21eb, 0x00ae2296, 0x00b22344, 0x00b523f6, + 0x00b924ab, 0x00be2564, 0x00c02622, 0x00c526e2, 0x00c827a7, 0x00cc286f, 0x00d0293b, 0x00d52a0b, + 0x00d72ae0, 0x00dd2bb7, 0x00e12c94, 0x00e62d75, 0x00eb2e5b, 0x00ef2f46, 0x00f23035, 0x00f83127, + 0x00fe321f, 0x0101331d, 0x0108341e, 0x010c3526, 0x01123632, 0x01173744, 0x011c385b, 0x01233977, + 0x01273a9a, 0x012e3bc1, 0x01343cef, 0x013a3e23, 0x01403f5d, 0x0146409d, 0x014c41e3, 0x0154432f, + 0x01594483, 0x016145dc, 0x0168473d, 0x016f48a5, 0x01764a14, 0x017d4b8a, 0x01854d07, 0x018d4e8c, + 0x01945019, 0x019d51ad, 0x01a4534a, 0x01ad54ee, 0x01b5569b, 0x01be5850, 0x01c75a0e, 0x01d05bd5, + 0x01d85da5, 0x01e35f7d, 0x01eb6160, 0x01f6634b, 0x01ff6541, 0x02096740, 0x02146949, 0x021e6b5d, + 0x02296d7b, 0x02336fa4, 0x023f71d7, 0x024a7416, 0x02567660, 0x026278b6, 0x026d7b18, 0x027a7d85 + // clang-format on +}; + +/*** 1/(1+X) LUT table for int16 Softmax */ +static const uint32_t ONE_OVER_ONE_PLUS_X_LUT[] = { + // clang-format off + 0xffc17fff, 0xffc07fc0, 0xffc27f80, 0xffc07f42, 0xffc17f02, 0xffc17ec3, 0xffc27e84, 0xffc27e46, + 0xffc27e08, 0xffc37dca, 0xffc27d8d, 0xffc37d4f, 0xffc37d12, 0xffc37cd5, 0xffc37c98, 0xffc47c5b, + 0xffc47c1f, 0xffc47be3, 0xffc57ba7, 0xffc57b6c, 0xffc37b31, 
0xffc67af4, 0xffc57aba, 0xffc67a7f, + 0xffc57a45, 0xffc67a0a, 0xffc779d0, 0xffc67997, 0xffc6795d, 0xffc77923, 0xffc778ea, 0xffc778b1, + 0xffc87878, 0xffc77840, 0xffc87807, 0xffc877cf, 0xffc97797, 0xffc87760, 0xffc97728, 0xffc976f1, + 0xffc976ba, 0xffc87683, 0xffca764b, 0xffca7615, 0xffca75df, 0xffca75a9, 0xffca7573, 0xffcb753d, + 0xffca7508, 0xffcb74d2, 0xffcb749d, 0xffca7468, 0xffcc7432, 0xffcc73fe, 0xffcb73ca, 0xffcc7395, + 0xffcd7361, 0xffcc732e, 0xffcc72fa, 0xffcd72c6, 0xffcd7293, 0xffcd7260, 0xffcc722d, 0xffce71f9, + 0xffcd71c7, 0xffce7194, 0xffce7162, 0xffce7130, 0xffcf70fe, 0xffce70cd, 0xffce709b, 0xffcf7069, + 0xffcf7038, 0xffcf7007, 0xffcf6fd6, 0xffcf6fa5, 0xffd06f74, 0xffd06f44, 0xffd06f14, 0xffd06ee4, + 0xffd06eb4, 0xffd06e84, 0xffd16e54, 0xffd16e25, 0xffd16df6, 0xffd16dc7, 0xffd06d98, 0xffd26d68, + 0xffd16d3a, 0xffd26d0b, 0xffd26cdd, 0xffd26caf, 0xffd26c81, 0xffd26c53, 0xffd36c25, 0xffd26bf8, + 0xffd36bca, 0xffd36b9d, 0xffd36b70, 0xffd26b43, 0xffd46b15, 0xffd36ae9, 0xffd46abc, 0xffd46a90, + 0xffd46a64, 0xffd46a38, 0xffd46a0c, 0xffd469e0, 0xffd469b4, 0xffd56988, 0xffd5695d, 0xffd56932, + 0xffd56907, 0xffd568dc, 0xffd568b1, 0xffd56886, 0xffd6685b, 0xffd56831, 0xffd66806, 0xffd667dc, + 0xffd667b2, 0xffd76788, 0xffd6675f, 0xffd76735, 0xffd6670c, 0xffd766e2, 0xffd666b9, 0xffd7668f, + 0xffd86666, 0xffd6663e, 0xffd86614, 0xffd765ec, 0xffd865c3, 0xffd8659b, 0xffd86573, 0xffd8654b, + 0xffd86523, 0xffd864fb, 0xffd964d3, 0xffd864ac, 0xffd96484, 0xffd8645d, 0xffd96435, 0xffd9640e, + 0xffd963e7, 0xffd963c0, 0xffd96399, 0xffda6372, 0xffd9634c, 0xffda6325, 0xffda62ff, 0xffda62d9, + 0xffda62b3, 0xffda628d, 0xffda6267, 0xffdb6241, 0xffda621c, 0xffdb61f6, 0xffda61d1, 0xffdc61ab, + 0xffd96187, 0xffdc6160, 0xffdb613c, 0xffdb6117, 0xffdb60f2, 0xffdc60cd, 0xffdc60a9, 0xffdb6085, + 0xffdc6060, 0xffdc603c, 0xffdc6018, 0xffdc5ff4, 0xffdc5fd0, 0xffdd5fac, 0xffdc5f89, 0xffdc5f65, + 0xffdd5f41, 0xffdd5f1e, 0xffdd5efb, 0xffdd5ed8, 0xffdd5eb5, 0xffdd5e92, 0xffdd5e6f, 0xffdd5e4c, + 
0xffdd5e29, 0xffde5e06, 0xffde5de4, 0xffdd5dc2, 0xffde5d9f, 0xffde5d7d, 0xffde5d5b, 0xffde5d39, + 0xffdf5d17, 0xffde5cf6, 0xffde5cd4, 0xffdf5cb2, 0xffdf5c91, 0xffde5c70, 0xffdf5c4e, 0xffdf5c2d, + 0xffde5c0c, 0xffe05bea, 0xffdf5bca, 0xffdf5ba9, 0xffdf5b88, 0xffdf5b67, 0xffe05b46, 0xffe05b26, + 0xffdf5b06, 0xffe05ae5, 0xffe05ac5, 0xffe05aa5, 0xffe05a85, 0xffe05a65, 0xffe05a45, 0xffe15a25, + 0xffe05a06, 0xffe059e6, 0xffe159c6, 0xffe159a7, 0xffe05988, 0xffe15968, 0xffe15949, 0xffe1592a, + 0xffe1590b, 0xffe158ec, 0xffe258cd, 0xffe158af, 0xffe15890, 0xffe25871, 0xffe15853, 0xffe25834, + 0xffe25816, 0xffe257f8, 0xffe157da, 0xffe257bb, 0xffe3579d, 0xffe25780, 0xffe25762, 0xffe25744, + 0xffe35726, 0xffe25709, 0xffe256eb, 0xffe356cd, 0xffe356b0, 0xffe35693, 0xffe25676, 0xffe35658, + 0xffe3563b, 0xffe3561e, 0xffe35601, 0xffe355e4, 0xffe455c7, 0xffe355ab, 0xffe4558e, 0xffe35572, + 0xffe45555, 0xffe35539, 0xffe4551c, 0xffe45500, 0xffe454e4, 0xffe454c8, 0xffe454ac, 0xffe45490, + 0xffe45474, 0xffe55458, 0xffe4543d, 0xffe45421, 0xffe55405, 0xffe553ea, 0xffe453cf, 0xffe553b3, + 0xffe45398, 0xffe5537c, 0xffe55361, 0xffe55346, 0xffe5532b, 0xffe55310, 0xffe552f5, 0xffe552da, + 0xffe652bf, 0xffe552a5, 0xffe5528a, 0xffe6526f, 0xffe55255, 0xffe6523a, 0xffe65220, 0xffe55206, + 0xffe651eb, 0xffe651d1, 0xffe651b7, 0xffe6519d, 0xffe65183, 0xffe65169, 0xffe7514f, 0xffe65136, + 0xffe6511c, 0xffe75102, 0xffe650e9, 0xffe750cf, 0xffe650b6, 0xffe7509c, 0xffe75083, 0xffe6506a, + 0xffe75050, 0xffe75037, 0xffe7501e, 0xffe75005, 0xffe74fec, 0xffe74fd3, 0xffe74fba, 0xffe74fa1, + 0xffe84f88, 0xffe74f70, 0xffe84f57, 0xffe74f3f, 0xffe84f26, 0xffe74f0e, 0xffe84ef5, 0xffe84edd, + 0xffe84ec5, 0xffe84ead, 0xffe74e95, 0xffe84e7c, 0xffe84e64, 0xffe94e4c, 0xffe84e35, 0xffe84e1d, + 0xffe84e05, 0xffe94ded, 0xffe84dd6, 0xffe84dbe, 0xffe94da6, 0xffe94d8f, 0xffe84d78, 0xffe84d60, + 0xffea4d48, 0xffe84d32, 0xffe94d1a, 0xffe94d03, 0xffe84cec, 0xffe94cd4, 0xffe94cbd, 0xffea4ca6, + 0xffe94c90, 0xffe84c79, 0xffea4c61, 
0xffe94c4b, 0xffe94c34, 0xffea4c1d, 0xffe94c07, 0xffea4bf0, + 0xffe94bda, 0xffea4bc3, 0xffea4bad, 0xffe94b97, 0xffea4b80, 0xffea4b6a, 0xffea4b54, 0xffea4b3e, + 0xffea4b28, 0xffea4b12, 0xffea4afc, 0xffea4ae6, 0xffea4ad0, 0xffeb4aba, 0xffea4aa5, 0xffea4a8f, + 0xffeb4a79, 0xffea4a64, 0xffea4a4e, 0xffeb4a38, 0xffeb4a23, 0xffea4a0e, 0xffeb49f8, 0xffea49e3, + 0xffeb49cd, 0xffeb49b8, 0xffeb49a3, 0xffeb498e, 0xffea4979, 0xffeb4963, 0xffeb494e, 0xffec4939, + 0xffeb4925, 0xffea4910, 0xffec48fa, 0xffeb48e6, 0xffeb48d1, 0xffec48bc, 0xffeb48a8, 0xffec4893, + 0xffeb487f, 0xffec486a, 0xffeb4856, 0xffec4841, 0xffec482d, 0xffeb4819, 0xffec4804, 0xffec47f0, + 0xffec47dc, 0xffec47c8, 0xffec47b4, 0xffec47a0, 0xffec478c, 0xffec4778, 0xffec4764, 0xffec4750, + 0xffec473c, 0xffed4728, 0xffec4715, 0xffec4701, 0xffed46ed, 0xffec46da, 0xffed46c6, 0xffec46b3, + 0xffec469f, 0xffed468b, 0xffed4678, 0xffec4665, 0xffed4651, 0xffed463e, 0xffed462b, 0xffec4618, + 0xffed4604, 0xffed45f1, 0xffed45de, 0xffed45cb, 0xffed45b8, 0xffed45a5, 0xffed4592, 0xffed457f, + 0xffee456c, 0xffed455a, 0xffed4547, 0xffed4534, 0xffee4521, 0xffed450f, 0xffed44fc, 0xffee44e9, + 0xffed44d7, 0xffee44c4, 0xffee44b2, 0xffed44a0, 0xffee448d, 0xffee447b, 0xffed4469, 0xffee4456, + 0xffee4444, 0xffee4432, 0xffee4420, 0xffee440e, 0xffee43fc, 0xffee43ea, 0xffee43d8, 0xffee43c6, + 0xffee43b4, 0xffee43a2, 0xffee4390, 0xffef437e, 0xffee436d, 0xffee435b, 0xffef4349, 0xffee4338, + 0xffee4326, 0xffef4314, 0xffee4303, 0xffef42f1, 0xffee42e0, 0xffef42ce, 0xffee42bd, 0xffef42ab, + 0xffef429a, 0xffee4289, 0xfff04277, 0xffee4267, 0xffef4255, 0xffef4244, 0xffef4233, 0xffef4222, + 0xffee4211, 0xffef41ff, 0xfff041ee, 0xffef41de, 0xffef41cd, 0xffee41bc, 0xfff041aa, 0xffef419a, + 0xffef4189, 0xffef4178, 0xfff04167, 0xffef4157, 0xffef4146, 0xfff04135, 0xffef4125, 0xfff04114, + 0xffef4104, 0xfff040f3, 0xffef40e3, 0xfff040d2, 0xfff040c2, 0xffef40b2, 0xfff040a1, 0xfff04091, + 0xfff04081, 0xffef4071, 0xfff04060, 0xfff04050, 0xfff04040, 0xfff04030, 
0xfff04020, 0xfff04010 + // clang-format on +}; + +Softmax::Softmax(Architecture *arch, OptimiserDatabase *db) : _arch(arch), _db(db) +{ + assert(_arch != nullptr); +} + +Operation *Softmax::ConvertOp(Operation *const operation) +{ + auto returnOp = operation; + + if ( OpType::Softmax == operation->Type() ) + { + auto ifmConn = operation->Input(TensorUsage::IFM0); + auto ofmConn = operation->Output(TensorUsage::OFM); + auto ifm = ifmConn->tensor.get(); + auto ofm = ofmConn->tensor.get(); + + if ( ifm->Type() == ofm->Type() || (ifm->Type() == DataType::Int8 && ofm->Type() == DataType::Int16) ) + { + // Reshape if needed + auto fullShape = Shape::PadAxes(ifmConn->shape, 4, 1); + if ( fullShape.Batch() > 1 ) + { + fullShape = fullShape.WithHeight(fullShape.Batch() * fullShape.Height()).WithBatch(1); + } + ifmConn->shape = fullShape; + ofmConn->shape = std::move(fullShape); + + if ( ifm->Type() == DataType::Int8 || ifm->Type() == DataType::UInt8 ) + { + returnOp = GetGraph8Bit(operation, ifmConn, ofmConn); + } + else if ( ifm->Type() == DataType::Int16 ) + { + returnOp = GetGraphInt16(operation, ifmConn, ofmConn); + } + } + if ( operation != returnOp ) + { + operation->Disconnect(); + } + } + + return returnOp; +} + + +void Softmax::RecordOptimisation(Operation *const operation, Operation *op) +{ + if ( _db ) + { + _db->AddOptimised(operation, op); + } +} + +Operation *Softmax::GetGraph8Bit(Operation *const operation, TensorConnection *ifmConn, TensorConnection *ofmConn) +{ + const auto &ifmQuant = ifmConn->quantization; + auto expTable = GenerateExpTable(double(operation->Parameters().softmax.beta), ifmQuant.scales[0].Dequantize()); + auto noScaleQuant = ifmConn->quantization; + noScaleQuant.scales.clear(); + auto noScaleQuantZp0 = noScaleQuant; + noScaleQuantZp0.zeroPoints[0] = 0; + auto oneScaleQuant = ifmConn->quantization; + oneScaleQuant.scales[0] = {1, 0}; + oneScaleQuant.zeroPoints[0] = 0; + auto twoScaleQuant = oneScaleQuant; + twoScaleQuant.scales[0] = {2, 
0}; + + // PASS 0 - Depthwise Maxpool + auto op = CreateDepthwiseMaxpool(ifmConn->tensor, ifmConn->shape, ifmConn->quantization, noScaleQuant); + op->SetRounding(RoundMode::DBL); + auto ifmMax = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 1 - Sub + auto subQuant = oneScaleQuant; + subQuant.zeroPoints[0] = 127; + op = CreateSub(ifmConn->tensor, ifmMax, ifmConn->quantization, noScaleQuant, subQuant, DataType::Int8, &ifmConn->shape); + op->SetRounding(RoundMode::DBL); + auto ifm_sub = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 1.5 - LUT(exp) + auto expLut = CreateConstTensor("exp_lut", DataType::Int32, std::make_shared(std::move(expTable))); + op = CreateLUT(ifm_sub, expLut, subQuant, subQuant); + auto ifm_exp = op->Output(TensorUsage::OFM)->tensor; + op->SetRounding(RoundMode::DBL); + RecordOptimisation(operation, op); + + // PASS 2 - ASR + auto right_shift12 = CreateConstTensor("right_shift12", 12); + op = CreateAsr(ifm_exp, right_shift12, subQuant, noScaleQuant, noScaleQuantZp0); + op->SetRounding(RoundMode::NATURAL); + op->attr.asr.round = true; + auto rescaled_exp = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 3 - Reduce sum + op = CreateReduceSum(rescaled_exp, noScaleQuantZp0, noScaleQuantZp0); + op->SetRounding(RoundMode::NATURAL); + auto sum_of_exp = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 4 - CLZ + op = CreateClz(sum_of_exp, noScaleQuantZp0, noScaleQuantZp0); + op->SetRounding(RoundMode::DBL); + auto headroom_plus_one = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 5 - Sub + auto headroom_offset = CreateConstTensor("headroom_offset", 12 + 31 - DataTypeSizeBits(ofmConn->tensor->Type())); + op = CreateSub(headroom_offset, headroom_plus_one, noScaleQuantZp0, noScaleQuantZp0, noScaleQuantZp0); + op->SetRounding(RoundMode::DBL); + auto right_shift = 
op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 6 - Sub + auto one = CreateConstTensor("one_const", 1); + op = CreateSub(headroom_plus_one, one, noScaleQuantZp0, noScaleQuant, noScaleQuantZp0); + op->SetRounding(RoundMode::DBL); + auto headroom = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 7 - SHL + op = CreateShl(sum_of_exp, headroom, noScaleQuantZp0, noScaleQuantZp0, oneScaleQuant); + op->SetRounding(RoundMode::DBL); + auto half_denominator = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 8 - Multiply + auto neg_32_over_17 = CreateConstTensor("neg_32_over_17", -int32_t((32ULL << 29U) / 17U)); + op = CreateMul(half_denominator, neg_32_over_17, oneScaleQuant, oneScaleQuant, twoScaleQuant); + op->SetRounding(RoundMode::DBL); + auto rescaled = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 9 - Add + auto const_48_over_17 = CreateConstTensor("const_48_over_17", int32_t((48ULL << 29U) / 17U)); + op = CreateAdd(rescaled, const_48_over_17, twoScaleQuant, noScaleQuant, oneScaleQuant); + op->SetRounding(RoundMode::DBL); + auto rescale_w_offset = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 10 - 24 + auto nr_x = std::move(rescale_w_offset); + auto F2_one = CreateConstTensor("F2_one", 1 << 29); + auto four = CreateConstTensor("four", 4); + for ( int i = 0; i < 3; ++i ) + { + // PASS 10, 15, 20 - MUL + op = CreateMul(nr_x, half_denominator, oneScaleQuant, oneScaleQuant, twoScaleQuant); + op->SetRounding(RoundMode::DBL); + auto half_denominator_times_x = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 11, 16, 21 - SUB + op = CreateSub(F2_one, half_denominator_times_x, noScaleQuant, twoScaleQuant, oneScaleQuant); + op->SetRounding(RoundMode::DBL); + auto one_minus_half_denominator_times_x = op->Output(TensorUsage::OFM)->tensor; + 
RecordOptimisation(operation, op); + + // PASS 12, 17, 22 - MUL + op = CreateMul(nr_x, one_minus_half_denominator_times_x, oneScaleQuant, oneScaleQuant, twoScaleQuant); + op->SetRounding(RoundMode::DBL); + auto to_rescale = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 13, 18, 23 - MUL + op = CreateMul(to_rescale, four, twoScaleQuant, noScaleQuant, noScaleQuantZp0); + op->SetRounding(RoundMode::DBL); + auto to_add = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 14, 19, 24 - ADD + op = CreateAdd(nr_x, to_add, oneScaleQuant, noScaleQuantZp0, oneScaleQuant); + op->SetRounding(RoundMode::DBL); + nr_x = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + } + + // PASS 25 - Multiply + op = CreateMul(ifm_exp, nr_x, oneScaleQuant, oneScaleQuant, oneScaleQuant); + op->SetRounding(RoundMode::DBL); + auto scaled_exp = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 26 - ASR + auto shrOp = std::make_shared(OpType::Asr); + op = shrOp.get(); + op->SetRounding(RoundMode::NATURAL); + op->attr.asr.round = true; + op->ConnectInput(TensorUsage::IFM, scaled_exp).Set(oneScaleQuant); + op->ConnectInput(TensorUsage::IFM1, right_shift).Set(noScaleQuantZp0); + op->ConnectOutput(TensorUsage::OFM, ofmConn->tensor).Set(ofmConn->quantization).Set(ofmConn->shape); + RecordOptimisation(operation, op); + + return op; +} + +Operation *Softmax::GetGraphInt16(Operation *const operation, TensorConnection *ifmConn, TensorConnection *ofmConn) +{ + auto noScaleQuant = ifmConn->quantization; + noScaleQuant.scales.clear(); + + // PASS 0 - Depthwise Maxpool + auto op = CreateDepthwiseMaxpool(ifmConn->tensor, ifmConn->shape, ifmConn->quantization, noScaleQuant); + op->SetRounding(RoundMode::NATURAL); + auto ifmMax = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 1 - Sub + op = CreateSub(ifmConn->tensor, ifmMax, 
ifmConn->quantization, noScaleQuant, ifmConn->quantization, DataType::Int32, + &ifmConn->shape); + op->SetRounding(RoundMode::DBL); + auto sub1_ofm = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 2 - Mul + double beta = double(operation->Parameters().softmax.beta); + double mul2_out_range = 10.0 / 65535.0; + auto quant = ElementwiseMulScale(ifmConn->quantization.scales[0].Dequantize(), beta, mul2_out_range); + auto scale_quant = ifmConn->quantization; + scale_quant.scales[0] = QuantizedScale(beta); + auto mul2_quant = ofmConn->quantization; + mul2_quant.scales[0] = QuantizedScale(mul2_out_range); + auto scale = CreateConstTensor("mul2_scale", quant.scale); + op = CreateMul(sub1_ofm, scale, ifmConn->quantization, scale_quant, mul2_quant); + op->SetRounding(RoundMode::DBL); + auto mul2_ofm = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 3 - Add + auto const_add = CreateConstTensor("add3_const", 32767); + op = CreateAdd(mul2_ofm, const_add, mul2_quant, noScaleQuant, mul2_quant, DataType::Int16); + op->SetRounding(RoundMode::DBL); + auto ifm_add = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 3.5 - LUT(exp) + auto expBuf = std::make_shared(int(std::size(EXP_LUT)), EXP_LUT, true); + auto expLut = CreateConstTensor("exp_lut", DataType::Int32, expBuf); + op = CreateLUT(ifm_add, expLut, mul2_quant, mul2_quant, DataType::Int16); + op->SetRounding(RoundMode::DBL); + auto ifm_exp = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 4 - Reduce sum + op = CreateReduceSum(ifm_exp, mul2_quant, noScaleQuant); + op->SetRounding(RoundMode::NATURAL); + auto sum_of_exp = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 5 - CLZ + op = CreateClz(sum_of_exp, noScaleQuant, noScaleQuant); + op->SetRounding(RoundMode::DBL); + auto headroom_plus_one = op->Output(TensorUsage::OFM)->tensor; + 
RecordOptimisation(operation, op); + + // PASS 6 - Sub + auto const_31 = CreateConstTensor("const_31", 31); + op = CreateSub(const_31, headroom_plus_one, noScaleQuant, noScaleQuant, noScaleQuant); + op->SetRounding(RoundMode::DBL); + auto reciprocal_right_shift = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 7 - SHL + auto one = CreateConstTensor("one_const", 1); + op = CreateShl(one, reciprocal_right_shift, noScaleQuant, noScaleQuant, noScaleQuant); + op->SetRounding(RoundMode::DBL); + auto constant_one = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 8 - Sub + op = CreateSub(sum_of_exp, constant_one, noScaleQuant, noScaleQuant, noScaleQuant); + op->SetRounding(RoundMode::DBL); + auto sum_of_exps_minus_one = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // # PASS 9 - SHL + op = CreateShl(sum_of_exps_minus_one, headroom_plus_one, noScaleQuant, noScaleQuant, noScaleQuant); + op->SetRounding(RoundMode::DBL); + auto shifted_sum_minus_one = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 10 - ASR + auto shift = CreateConstTensor("shift_const", 15); + op = CreateAsr(shifted_sum_minus_one, shift, noScaleQuant, noScaleQuant, noScaleQuant); + op->SetRounding(RoundMode::NATURAL); + op->attr.asr.round = true; + auto shifted_sum_minus_one_16 = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 11 - Sub + auto sub11_const = CreateConstTensor("sub11_const", 32768); + op = CreateSub(shifted_sum_minus_one_16, sub11_const, noScaleQuant, noScaleQuant, noScaleQuant, DataType::Int16); + op->SetRounding(RoundMode::DBL); + auto reciprocal_scale = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 11.5 - LUT(one over one plus x) + auto oneOverOnePlusXBuf = std::make_shared(int(std::size(ONE_OVER_ONE_PLUS_X_LUT)), ONE_OVER_ONE_PLUS_X_LUT, true); + auto 
oneOverOnePlusXLut = CreateConstTensor("one_over_one_plus_x_lut", DataType::Int32, oneOverOnePlusXBuf); + op = CreateLUT(reciprocal_scale, oneOverOnePlusXLut, noScaleQuant, noScaleQuant, DataType::Int16); + op->SetRounding(RoundMode::DBL); + reciprocal_scale = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // # PASS 12 - Multiply + op = CreateMul(ifm_exp, reciprocal_scale, noScaleQuant, noScaleQuant, noScaleQuant, DataType::Int32); + op->SetRounding(RoundMode::DBL); + auto mul_ofm = op->Output(TensorUsage::OFM)->tensor; + RecordOptimisation(operation, op); + + // PASS 13 - ASR + auto shrOp = std::make_shared(OpType::Asr); + op = shrOp.get(); + op->SetRounding(RoundMode::NATURAL); + op->attr.asr.round = true; + op->ConnectInput(TensorUsage::IFM, mul_ofm).Set(noScaleQuant); + op->ConnectInput(TensorUsage::IFM1, reciprocal_right_shift).Set(noScaleQuant); + op->ConnectOutput(TensorUsage::OFM, ofmConn->tensor).Set(ofmConn->quantization).Set(ofmConn->shape); + RecordOptimisation(operation, op); + + return op; +} + +std::vector Softmax::GenerateExpTable(double beta, double inputScale) +{ + const int kTableSize = 256; + const int kIntegerBits = 5; + const int kSignedBits = 31; + std::vector expTable(kTableSize); + using FixedPoint = gemmlowp::FixedPoint; + + const double realBeta = std::min(beta * inputScale * (1 << (kSignedBits - kIntegerBits)), (1ll << kSignedBits) - 1.0); + const auto quant = QuantizedScale(realBeta); + const int leftShift = 31 - quant.shift; + const int diffMin = -int(std::floor(1.0 * ((1 << kIntegerBits) - 1) * (1 << (kSignedBits - kIntegerBits)) / (1U << leftShift))); + + for ( int x = 0; x < kTableSize; ++x ) + { + int inputDiff = x - 255; + if ( inputDiff >= diffMin ) + { + const int32_t inputDiffRescaled = gemmlowp::SaturatingRoundingDoublingHighMul( + ClampToType(inputDiff * (1LL << leftShift)), quant.scale); + expTable[x] = gemmlowp::exp_on_negative_values(FixedPoint::FromRaw(inputDiffRescaled)).raw(); + } + else 
+ { + expTable[x] = 0; + } + } + + return expTable; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/softmax.hpp b/ethosu/regor/compiler/softmax.hpp new file mode 100644 index 00000000..88ecbb7e --- /dev/null +++ b/ethosu/regor/compiler/softmax.hpp @@ -0,0 +1,53 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/scaling.hpp" +#include "graph_optimiser.hpp" +#include "operation.hpp" +#include "operation_util.hpp" + +#include +#include +#include + +namespace regor +{ + +/// +/// TFLite Graph optimiser Softmax rewriter +/// +class Softmax +{ +private: + Architecture *_arch = nullptr; + OptimiserDatabase *_db = nullptr; + +public: + Softmax(Architecture *arch, OptimiserDatabase *db); + Operation *ConvertOp(Operation *const operation); + +private: + void RecordOptimisation(Operation *const operation, Operation *op); + Operation *GetGraph8Bit(Operation *const operation, TensorConnection *ifmConn, TensorConnection *ofmConn); + Operation *GetGraphInt16(Operation *const operation, TensorConnection *ifmConn, TensorConnection *ofmConn); + std::vector GenerateExpTable(double beta, double inputScale); +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/tensor.cpp b/ethosu/regor/compiler/tensor.cpp new file mode 100644 index 00000000..1b2fc6f6 --- /dev/null +++ 
b/ethosu/regor/compiler/tensor.cpp @@ -0,0 +1,118 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "compiler/tensor.hpp" + +#include "common/common.hpp" + +#include "architecture/architecture.hpp" +#include "common/buffer_view.hpp" +#include "common/data_type.hpp" +#include "common/shape.hpp" + +#include +#include +#include +#include + +namespace regor +{ + +Tensor::Tensor(const std::string &name, DataType type) : _name(name), _type(type), _uid(GenerateUniqueId()) +{ +} + +Tensor::Tensor(const std::string &name, DataType type, Shape shape) : + _name(name), _type(type), _uid(GenerateUniqueId()), _storageShape(std::move(shape)) +{ +} + +Tensor::Tensor(const std::string &name, DataType type, Shape shape, const std::shared_ptr &buffer) : + _name(name), _type(type), _uid(GenerateUniqueId()), _storageShape(shape), _buffer(buffer) +{ + assert(DataTypeStorageSizeBytes(type, shape.Elements()) <= buffer->Size()); +} + +BufferView Tensor::View() const +{ + int elementBits = DataTypeSizeBits(_type) > 0 ? 
DataTypeSizeBits(_type) : 8; + return BufferView(_buffer, 0, elementBits, _storageShape, Shape()); +} + +bool Tensor::IsConstant() const +{ + return _buffer && _buffer->Size(); +} + +void Tensor::Reshape(const Shape &shape) +{ + assert(shape.Elements() == StorageShape().Elements()); + SetStorageShape(shape); +} +void Tensor::ChangeType(DataType newType) +{ + assert(!IsConstant()); + _type = newType; +} + +void Tensor::AddReader(std::shared_ptr reader) +{ + if ( std::find(_readers.begin(), _readers.end(), reader) == _readers.end() ) + { + _readers.push_back(reader); + } +} +void Tensor::AddWriter(std::shared_ptr writer) +{ + if ( std::find(_writers.begin(), _writers.end(), writer) == _writers.end() ) + { + _writers.push_back(writer); + } +} +void Tensor::RemoveReader(std::shared_ptr reader) +{ + _readers.erase(std::remove(_readers.begin(), _readers.end(), reader), _readers.end()); +} +void Tensor::RemoveWriter(std::shared_ptr writer) +{ + _writers.erase(std::remove(_writers.begin(), _writers.end(), writer), _writers.end()); +} +void Tensor::RemoveReaders() +{ + _readers.clear(); +} +void Tensor::RemoveWriters() +{ + _writers.clear(); +} + +std::unique_ptr Tensor::Clone() const +{ + auto clone = std::make_unique(*this); + clone->_uid = GenerateUniqueId(); + clone->RemoveReaders(); + clone->RemoveWriters(); + return clone; +} + +std::string Tensor::ToString() const +{ + return fmt::format("", Name(), StorageShape().ToString(), DataTypeToString(Type())); +} + +} // namespace regor diff --git a/ethosu/regor/compiler/tensor.hpp b/ethosu/regor/compiler/tensor.hpp new file mode 100644 index 00000000..f83b0009 --- /dev/null +++ b/ethosu/regor/compiler/tensor.hpp @@ -0,0 +1,95 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "architecture/architecture.hpp" +#include "common/buffer_view.hpp" +#include "common/data_type.hpp" +#include "common/shape.hpp" +#include "include/graphapi.hpp" +#include "tensor_properties.hpp" + +#include +#include +#include + +namespace regor +{ + +class Operation; + +/// +/// Graph tensor representation +/// +class Tensor : public GraphApi::GraphTensor, public std::enable_shared_from_this +{ +private: + std::string _name; + DataType _type; + UniqueId _uid; + class Shape _storageShape; + std::shared_ptr _buffer; + enum AxisOrder _axisOrder = AxisOrder::Unknown; + const void *_passthrough = nullptr; // Original flatbuffer description of this tensor (if it was loaded from one) + + std::vector> _readers; + std::vector> _writers; + +public: + Tensor(const std::string &name, DataType type); + Tensor(const std::string &name, DataType type, Shape shape); + Tensor(const std::string &name, DataType type, Shape shape, const std::shared_ptr &buffer); + + const std::string &Name() const { return _name; } + void SetName(const std::string &name) { _name = name; } + DataType Type() const { return _type; } + UniqueId Uid() const { return _uid; } + + const Shape &StorageShape() const { return _storageShape; } + void SetStorageShape(const Shape &shape) { _storageShape = shape; } + void SetBuffer(const std::shared_ptr &buffer) { _buffer = buffer; } + + BufferView View() const; + bool IsConstant() const; + void Reshape(const Shape &shape); + void ChangeType(DataType newType); + + enum AxisOrder AxisOrder() 
const { return _axisOrder; } + void SetAxisOrder(enum AxisOrder axisOrder) { _axisOrder = axisOrder; } + + const void *Passthrough() const { return _passthrough; } + void SetPassthrough(const void *passthrough) { _passthrough = passthrough; } + + const std::vector> &Readers() const { return _readers; } + const std::vector> &Writers() const { return _writers; } + + void AddReader(std::shared_ptr reader); + void AddWriter(std::shared_ptr writer); + void RemoveReader(std::shared_ptr reader); + void RemoveWriter(std::shared_ptr writer); + void RemoveReaders(); + void RemoveWriters(); + + std::unique_ptr Clone() const; + std::string ToString() const; +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/tensor_allocator.cpp b/ethosu/regor/compiler/tensor_allocator.cpp new file mode 100644 index 00000000..510d3619 --- /dev/null +++ b/ethosu/regor/compiler/tensor_allocator.cpp @@ -0,0 +1,149 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "tensor_allocator.hpp" + +#include "common/common.hpp" +#include "common/logging.hpp" + +#include "architecture/architecture.hpp" +#include "common/bit_flags.hpp" +#include "common/numeric_util.hpp" +#include "hillclimb_allocator.hpp" +#include "live_range.hpp" + +namespace regor +{ + +namespace +{ + +// Implementation of the linear allocator +Address LinearAllocateLiveRanges(LiveRangeGraph &lrGraph, int alignment) +{ + Address address = 0; + for ( const auto &lr : lrGraph.LiveRanges() ) + { + lr->SetAddress(address); + address += RoundAway(lr->size, alignment); + } + return address; +} + +void PrintAllocation(LiveRangeGraph &lrGraph, Address totalSize) +{ + LOG_PRINT("{0:10} - {1:10}: {2:>10} - {3:>10}: {4:11}: {5:12} : {6}\n", "Start Time", "End Time", "Start Addr", + "End Addr", "Tensor Size", "Memory Usage", "Name"); + auto lrs = lrGraph.LiveRanges(); + // Sort LiveRanges + std::sort(lrs.begin(), lrs.end(), + [](const std::shared_ptr &a, const std::shared_ptr &b) + { + if ( a->startTime != b->startTime ) return a->startTime < b->startTime; + return a->endTime < b->endTime; + }); + // Create memory histogram to track usage over time + std::vector memHist(lrGraph.EndTime(), 0); + for ( const auto &lr : lrs ) + { + for ( int t = lr->startTime; t <= lr->endTime; ++t ) + { + memHist[t] += lr->size; + } + } + + for ( const auto &lr : lrs ) + { + if ( lr->tensors.empty() ) + { + continue; + } + auto address = (*lr->tensors.begin())->allocatedAddress; + auto peakUsageDuringLiveRange = *std::max_element(memHist.begin() + lr->startTime, memHist.begin() + lr->endTime + 1); + for ( const auto &tens : lr->tensors ) + { + LOG_PRINT("{0:10} - {1:10}: {2:#10x} - {3:#10x}: {4:11}: {5:12} : {6}\n", lr->startTime, lr->endTime, + address, address + lr->size, lr->size, peakUsageDuringLiveRange, tens->Name()); + } + } + LOG_PRINT("Allocation Peak Tensor Size: {} bytes == {} KiB\n", totalSize, double(totalSize) / 1024.0); +} + +Address Allocate(LiveRangeGraph 
&lrGraph, const std::vector> &schedOps, + Schedule *schedule, const MemArea &targetMemory, TensorAllocator allocator, int alignment, Address sizeLimit) +{ + lrGraph.ExtractLiveRangesFromCascades(schedOps, schedule, targetMemory, false); + Address totalSize = 0; + if ( allocator == TensorAllocator::LinearAlloc ) + { + totalSize = LinearAllocateLiveRanges(lrGraph, alignment); + } + else if ( allocator == TensorAllocator::HillClimb ) + { + totalSize = HillClimbAllocateLiveRanges(lrGraph, alignment, sizeLimit); + } + return totalSize; +} + +} // namespace + +Address IncrementalLinearAllocator::Allocate(LiveRangeGraph *lrGraph, int alignment, bool verboseAllocation) +{ + for ( const auto &lr : lrGraph->LiveRanges() ) + { + if ( !lr->tensors.empty() ) + { + auto tensor = *(lr->tensors.begin()); + auto it = _allocatedAddresses.find(tensor->equivalenceId); + if ( it == _allocatedAddresses.end() ) + { + lr->SetAddress(_highestAddress); + _allocatedAddresses[tensor->equivalenceId] = _highestAddress; + _highestAddress += RoundAway(lr->size, alignment); + } + else + { + // An equivalent tensor has previously been allocated, reuse its address + lr->SetAddress(it->second); + } + } + } + if ( verboseAllocation ) + { + LOG_PRINT("{0:#^{1}}\n", "", 80); + LOG_PRINT("Tensor Allocation for {}:\n", _name); + PrintAllocation(*lrGraph, _highestAddress); + } + return _highestAddress; +} + +void AllocateTensors(const std::vector> &schedOps, Schedule *schedule, + const MemArea &memArea, TensorAllocator allocator, int alignment, bool verboseAllocation, Address sizeLimit) +{ + LiveRangeGraph lrGraph; + auto totalSize = Allocate(lrGraph, schedOps, schedule, memArea, allocator, alignment, sizeLimit); + if ( verboseAllocation ) + { + LOG_PRINT("{0:#^{1}}\n", "", 80); + LOG_PRINT("Allocation, memory {}, usage mask: {}\n", memArea.memory->Name(), memArea.usage.ToString()); + PrintAllocation(lrGraph, totalSize); + } + schedule->memoryUsage[memArea] = int(totalSize); +} + +} // namespace regor diff 
--git a/ethosu/regor/compiler/tensor_allocator.hpp b/ethosu/regor/compiler/tensor_allocator.hpp new file mode 100644 index 00000000..14a62f80 --- /dev/null +++ b/ethosu/regor/compiler/tensor_allocator.hpp @@ -0,0 +1,64 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "architecture/architecture.hpp" +#include "scheduler.hpp" +#include "scheduler_operation.hpp" + +#include +#include + +namespace regor +{ + +class LiveRangeGraph; + +// Tensor allocation algorithms +enum class TensorAllocator : uint16_t +{ + // Allocator that does not reuse memory + LinearAlloc = 0, + // Search based allocator + HillClimb = 1, + Last, +}; + +/// +/// Linear allocator that can be used to allocate addresses across multiple subgraphs +/// +class IncrementalLinearAllocator +{ +public: + IncrementalLinearAllocator(const std::string &name) : _name(name) {} + Address Allocate(LiveRangeGraph *lrGraph, int alignment, bool verboseAllocation); + +private: + std::string _name; + // Map from tensor's equivalence id to allocated address + std::unordered_map _allocatedAddresses; + Address _highestAddress = 0; +}; + +// Allocates addresses to the tensors involved in the given operations/mem area(s) +// using the given tensor allocation algorithm. 
+void AllocateTensors(const std::vector> &schedOps, Schedule *schedule, const MemArea &memArea, + TensorAllocator allocator, int alignment, bool verboseAllocation, Address sizeLimit = std::numeric_limits
::max()); + +} // namespace regor diff --git a/ethosu/regor/compiler/tensor_properties.hpp b/ethosu/regor/compiler/tensor_properties.hpp new file mode 100644 index 00000000..fa45f5d3 --- /dev/null +++ b/ethosu/regor/compiler/tensor_properties.hpp @@ -0,0 +1,90 @@ +// +// SPDX-FileCopyrightText: Copyright 2023-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/common.hpp" + +#include "include/graphapi.hpp" + +namespace regor +{ + +// Aliased by value (using/typedef doesn't work) +enum class AxisOrder : int16_t +{ + Unknown = int16_t(GraphApi::AxisOrder::Unknown), + OHWI = int16_t(GraphApi::AxisOrder::OHWI), + IHWO = int16_t(GraphApi::AxisOrder::IHWO), + OI = int16_t(GraphApi::AxisOrder::OI), +}; + +/// +/// Classification for how a Tensor is consumed by an operator. 
+/// +enum class TensorUsage : uint32_t +{ + None = 0, + IFM = 0x01, + OFM = 0x02, + Weights = 0x03, + Scales = 0x04, + Params = 0x05, + LUT = 0x06, + Commands = 0x07, + State = 0x08, + Last, + TypeMask = 0x0F, + IndexShift = 8, + IndexMask = 0xFFFFF00, + IFM0 = IFM, + IFM1 = 0x0100 | IFM, + IFM2 = 0x0200 | IFM, + Params0 = Params, + Params1 = 0x100 | Params, +}; + +DECLARE_ENUM_AS_FLAGS(TensorUsage) + +constexpr int MAX_NUM_IFM = 3; + +constexpr inline bool IsOFM(TensorUsage usage) +{ + return (usage & TensorUsage::TypeMask) == TensorUsage::OFM; +} + +constexpr inline bool IsIFM(TensorUsage usage) +{ + return (usage & TensorUsage::TypeMask) == TensorUsage::IFM; +} + +constexpr inline TensorUsage MakeTensorUsage(TensorUsage type, int index) +{ + return TensorUsage(uint32_t(type) | (index << 8)); +} + +constexpr inline int GetUsageIndex(TensorUsage usage) +{ + return unsigned(usage & TensorUsage::IndexMask) >> unsigned(TensorUsage::IndexShift); +} +constexpr inline TensorUsage GetUsageType(TensorUsage usage) +{ + return (usage & TensorUsage::TypeMask); +} + +} // namespace regor diff --git a/ethosu/regor/compiler/tflite_graph_optimiser.cpp b/ethosu/regor/compiler/tflite_graph_optimiser.cpp new file mode 100644 index 00000000..6dde3b4e --- /dev/null +++ b/ethosu/regor/compiler/tflite_graph_optimiser.cpp @@ -0,0 +1,2936 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "compiler/tflite_graph_optimiser.hpp" + +#include "common/logging.hpp" + +#include "architecture/architecture.hpp" +#include "common/reverse_type.hpp" +#include "common/scaling.hpp" +#include "common/transpose_type.hpp" +#include "graph.hpp" +#include "graph_optimiser.hpp" +#include "op_type.hpp" +#include "operation.hpp" +#include "optimiser_utils.hpp" +#include "softmax.hpp" +#include "tensor.hpp" +#include "tflite/tflite_schema_generated.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace regor +{ + +using namespace GraphOptimisation; + +// Is the scaling of Tensor connection a and b valid and equal. +bool TFLiteGraphOptimiser::IsScalingValidAndEqual(const TensorConnection &a, const TensorConnection &b) +{ + return (a.quantization.IsValid() && b.quantization.IsValid() && a.quantization.scales == b.quantization.scales && + a.quantization.zeroPoints == b.quantization.zeroPoints); +} + + +// Multiplies int with QuantizedScale with rounding. +int TFLiteGraphOptimiser::MultiplyByQuantizedMultiplier(int x, QuantizedScale quantScale) +{ + // Multiplies x (int32) by QuantizedScale (scale, shift), returns rounded result. + // Expects the QuantizedScale to be left-shift positive. + const int leftShift = quantScale.shift > 0 ? quantScale.shift : 0; + const int rightShift = quantScale.shift < 0 ? 
-quantScale.shift : 0; + const std::int32_t mul = gemmlowp::SaturatingRoundingDoublingHighMul(x * (1 << leftShift), quantScale.scale); + return gemmlowp::RoundingDivideByPOT(mul, rightShift); +} + +Operation *TFLiteGraphOptimiser::MakeMulWithConstTensor(const std::string &name, const TensorConnection &ifmConn, + const TensorConnection &ofmConn, const std::shared_ptr &constTens, const Quantization &quantization) +{ + auto ofm = ofmConn.tensor; + auto op = std::make_shared(OpType::Mul); + op->SetRounding(RoundMode::DBL); + + op->CopyInput(TensorUsage::IFM0, ifmConn); + op->ConnectInput(TensorUsage::IFM1, constTens).Set(quantization); + + auto ofmName = ofm->Name(); + ofmName.append("_"); + ofmName.append(name); + + std::shared_ptr cloneOfm = ofm->Clone(); + cloneOfm->SetName(ofmName); + op->ConnectOutput(TensorUsage::OFM, cloneOfm).Set(ofmConn.shape).Set(ofmConn.quantization).Set(ofmConn.slice); + + return op.get(); +} + +Operation *TFLiteGraphOptimiser::MakeOperation( + OpType opType, const TensorConnection *ifm0Conn, const TensorConnection *ifm1Conn, const TensorConnection *ofmConn) +{ + auto op = std::make_shared(opType); + assert(ifm0Conn != nullptr); + assert(ofmConn != nullptr); + op->CopyInput(TensorUsage::IFM0, *ifm0Conn); + op->CopyOutput(TensorUsage::OFM, *ofmConn); + if ( ifm1Conn != nullptr ) + { + op->CopyInput(TensorUsage::IFM1, *ifm1Conn); + } + op->SetRounding(RoundMode::DBL); + return op.get(); +} + +// Converts LeakyReLU to +// if alpha <= 1 +// Max(alpha * IFM, identity * IFM) +// else +// Min(alpha * IFM, identity * IFM) +Operation *TFLiteGraphOptimiser::ConvertLeakyRelu16bit(TensorConnection &ifmConn, TensorConnection &ofmConn, Operation *operation) +{ + Operation *returnOp = operation; + + auto ifm = ifmConn.tensor.get(); + auto ofm = ofmConn.tensor.get(); + float alpha = operation->Parameters().leaky_relu.alpha; + int scalar = 1; + + auto alphaQuant = ifmConn.quantization; + alphaQuant.quantMin = {0}; + alphaQuant.quantMax = {int64_t(alpha * 
IntegerMax(ifmConn.tensor->Type()))}; + alphaQuant.zeroPoints[0] = 0; + alphaQuant.scales[0] = QuantizedScale(alpha); + + if ( alpha < 0 ) + { + scalar = -1; + alphaQuant.scales[0].scale *= -1; + } + + // Multiply all values with alpha + auto fmAlpha = CreateConstTensor("lrelu_alpha", int16_t(scalar)); + auto alphaMulOp = MakeMulWithConstTensor("alpha", ifmConn, ofmConn, fmAlpha, alphaQuant); + + RecordOptimisation(operation, alphaMulOp); + TensorConnection *identityConn = &ifmConn; + + if ( !IsScalingValidAndEqual(ifmConn, ofmConn) ) + { + // Identity operation is introduced to handle rescaling of the IFM + auto identityQuant = ifmConn.quantization; + identityQuant.quantMin = {0}; + identityQuant.quantMax = {int64_t(IntegerMax(ifmConn.tensor->Type()))}; + identityQuant.zeroPoints[0] = 0; + identityQuant.scales[0] = {1, 0}; + + auto fmIdentity = CreateConstTensor("lrelu_ident", int16_t(1)); + + auto identityMulOp = MakeMulWithConstTensor("identity", ifmConn, ofmConn, fmIdentity, identityQuant); + RecordOptimisation(operation, identityMulOp); + identityConn = identityMulOp->Output(TensorUsage::OFM); + } + + // Merge scaled and unscaled values with a MIN or a MAX depending on alpha + if ( alpha <= 1 ) + { + // If alpha <= 1 + // Max(negative * alpha, negative) = negative * alpha + // Max(positive * alpha, positive) = positive + auto maxOp = MakeOperation(OpType::Maximum, alphaMulOp->Output(TensorUsage::OFM), identityConn, &ofmConn); + RecordOptimisation(operation, maxOp); + returnOp = maxOp; + } + else + { + // If alpha > 1: + // Min(negative * alpha, negative) = negative * alpha + // Min(positive * alpha, positive) = positive + auto minOp = MakeOperation(OpType::Minimum, alphaMulOp->Output(TensorUsage::OFM), identityConn, &ofmConn); + RecordOptimisation(operation, minOp); + returnOp = minOp; + } + + return returnOp; +} + + +// Get axis parameter for operator +int TFLiteGraphOptimiser::GetAxis(const Operation *const operation) +{ + auto opType = operation->Type(); + 
int axis = 0; + + switch ( opType ) + { + case OpType::Concat: + case OpType::ConcatTFLite: + axis = operation->Parameters().concat.axis; + break; + case OpType::Pack: + case OpType::Unpack: + axis = operation->Parameters().pack_unpack.axis; + break; + case OpType::Split: + { + auto *paramConn = operation->Input(TensorUsage::Params); + axis = paramConn->tensor->View().Values()[0]; + break; + } + case OpType::SplitV: + { + auto usage = MakeTensorUsage(TensorUsage::Params, 1); + auto *paramConn = operation->Input(usage); + axis = paramConn->tensor->View().Values()[0]; + break; + } + default: + break; + } + return axis; +} + + +// Calculate the read shape and offset values for Slice. +void TFLiteGraphOptimiser::SetSliceOffsetValues(Operation *const operation, Shape &readShape, Shape &readOffset) +{ + auto *beginConn = operation->Input(TensorUsage::Params0); + auto *sizeConn = operation->Input(TensorUsage::Params1); + + for ( auto idx = 0; idx < beginConn->tensor->View().ViewShape()[0]; idx++ ) + { + auto begin = beginConn->tensor->View().Values()[idx]; + auto size = sizeConn->tensor->View().Values()[idx]; + readOffset[idx] = begin; + readShape[idx] = size; + } + + readOffset = Shape::PadAxes(readOffset, 4, 0); + readShape = Shape::PadAxes(readShape, 4, 1); +} + + +// Calculate the read shape and offset values for StridedSlice. +void TFLiteGraphOptimiser::SetStridedSliceOffsetValues( + Operation *const operation, const TensorConnection *const ifmConn, Shape &readShape, Shape &readOffset) +{ + auto *beginConn = operation->Input(TensorUsage::Params0); + auto *endConn = operation->Input(TensorUsage::Params1); + + // strides tensor not used. 
+ auto beginMask = operation->Parameters().strided_slice.begin_mask; + auto endMask = operation->Parameters().strided_slice.end_mask; + + readShape = ifmConn->shape; + + for ( auto idx = 0; idx < ifmConn->shape.Size(); idx++ ) + { + // If the i:th bit in the mask is set then the value on offset_tens[i] should be ignored + if ( (beginMask & (1 << idx)) == 0 ) + { + readOffset[idx] = beginConn->tensor->View().Values()[idx]; + if ( readOffset[idx] < 0 ) + { + // Convert offset to positive value + readOffset[idx] += ifmConn->shape[idx]; + } + } + if ( (endMask & (1 << idx)) == 0 ) + { + readShape[idx] = endConn->tensor->View().Values()[idx]; + if ( readShape[idx] < 0 ) + { + // Convert offset to positive value + readShape[idx] += ifmConn->shape[idx]; + } + } + } + readOffset = Shape::PadAxes(readOffset, 4, 0); +} + + +// Creates MemoryCopy operation for the given ifm/ofm and write offset. +std::shared_ptr TFLiteGraphOptimiser::MakeMemoryCopyForConcat( + const TensorConnection *const ofmConn, const TensorConnection *const ifmConn, const Shape &writeOffset) +{ + auto op = std::make_shared(OpType::MemoryCopy); + op->SetRounding(RoundMode::NATURAL); + + op->CopyInput(TensorUsage::IFM0, *ifmConn); + op->ConnectOutput(TensorUsage::OFM, ofmConn->tensor) + .Set(ofmConn->shape) + .Set(ofmConn->quantization) + .Set({writeOffset, ifmConn->shape}); + + return op; +} + + +// Creates a MemoryCopy operation for the given ifm/ofm and readOffset. 
+std::shared_ptr TFLiteGraphOptimiser::MakeMemoryCopyForSplitOps(const TensorConnection *const ofmConn, + const TensorConnection *const ifmConn, const Shape &readShape, const Shape &readOffset) +{ + auto op = std::make_shared(OpType::MemoryCopy); + op->SetRounding(RoundMode::NATURAL); + op->ConnectInput(TensorUsage::IFM0, ifmConn->tensor).Set(ifmConn->shape).Set(ifmConn->quantization).Set({readOffset, readShape}); + op->CopyOutput(TensorUsage::OFM, *ofmConn); + + return op; +} + + +// Creates the desired shape of either: +// - Concat (Input shape - supply IFM base shape) +// - Split/SplitV (Output shape - supply OFM base shape) +// +// returns the Desired shape. +// Also calculates the axis4D, returned through supplied pointer. +Shape TFLiteGraphOptimiser::MakeConcatSplitDesiredShape(int axis, const Shape &baseShape, int *const axis4D) +{ + // Convert axis to positive. + if ( axis < 0 ) + { + axis += baseShape.Size(); + } + int to4D = (4 - baseShape.Size()); + *axis4D = axis + to4D; + return Shape::PadAxes(baseShape, 4, 1); +} + + +// Creates the desired shape of either: +// - pack (Input shape - supply IFM base shape) +// - unpack (Output shape - supply OFM base shape) +// +// returns the Desired shape. +// Unpack keeps the unpacked dimension set to 1. +// Also calculates the axis4D, returned through supplied pointer. +Shape TFLiteGraphOptimiser::MakePackUnpackDesiredShape(int axis, const Shape &baseShape, int *const axis4D) +{ + // Convert axis to positive. + if ( axis < 0 ) + { + axis += baseShape.Size() + 1; + } + Shape tmp = baseShape; + tmp = tmp.Insert(axis, 1); + int to4D = (4 - tmp.Size()); + *axis4D = axis + to4D; + return Shape::PadAxes(tmp, 4, 1); +} + + +// Creates the desired Output shape of StridedSlice. +// +// returns the Desired shape. 
+Shape TFLiteGraphOptimiser::MakeStridedSliceDesiredShape(Operation *const operation, const Shape &baseShape) +{ + auto newMask = unsigned(operation->Parameters().strided_slice.new_axis_mask); + auto shrinkMask = unsigned(operation->Parameters().strided_slice.shrink_axis_mask); + + if ( newMask == 0 && shrinkMask == 0 ) + { + return baseShape; + } + assert((newMask == 0) || (shrinkMask == 0)); + + Shape tmp = baseShape; + while ( shrinkMask ) + { + auto prevMask = shrinkMask; + shrinkMask &= shrinkMask - 1; + auto axis = 0; + auto diff = prevMask - shrinkMask; + diff >>= 1; + while ( diff ) + { + diff >>= 1; + ++axis; + } + tmp = tmp.Insert(axis, 1); + } + + while ( newMask ) + { + auto prevMask = newMask; + newMask &= newMask - 1; + auto axis = 0; + auto diff = prevMask - newMask; + diff >>= 1; + while ( diff ) + { + diff >>= 1; + ++axis; + } + tmp = tmp.Erase(axis); + newMask >>= 1; + } + + return Shape::PadAxes(tmp, 4, 1); +} + + +// Move Split/slice op to consumer +void TFLiteGraphOptimiser::MoveToConsumer(const Operation *const operation, Operation *const cons) +{ + auto *ifmConn = operation->Input(TensorUsage::IFM0); + auto *ofm = operation->OFM(); + auto *consIfm0 = cons->IFM(0); + auto *consIfm1 = cons->IFM(1); + + if ( consIfm0 == ofm ) + { + cons->CopyInput(TensorUsage::IFM0, *ifmConn); + } + else if ( consIfm1 != nullptr && IsBinaryElementwise(cons->Type()) && consIfm1 == ofm ) + { + cons->CopyInput(TensorUsage::IFM1, *ifmConn); + } +} + +void TFLiteGraphOptimiser::ReplaceOperation(Operation *const operationToReplace, Operation *const newOperation) +{ + auto oldOperation = operationToReplace->shared_from_this(); + + for ( const auto &input : oldOperation->Inputs().pairs() ) + { + newOperation->CopyInput(input.first, input.second); + } + for ( const auto &output : oldOperation->Outputs().pairs() ) + { + newOperation->CopyOutput(output.first, output.second); + } + oldOperation->Disconnect(); +} + +Operation *TFLiteGraphOptimiser::MakeDepthwiseMeanOp(const 
TensorConnection *ifmConn, const Shape &ifmShape4D, const Shape &readShape, + const Shape &readOffset, const Shape &ofmShape4D, int w, int h, const std::string &name, std::shared_ptr &weightTensor, + std::shared_ptr biasTensor, const Quantization &ifmQuant, const Quantization &weightQuant, const Quantization &ofmQuant) +{ + auto ifm = ifmConn->tensor; + auto op = std::make_shared(OpType::DepthwiseConv2DBias); + op->SetRounding(ifm->Type() == DataType::Int16 ? RoundMode::NATURAL : RoundMode::DBL); + op->SetKernel(std::make_unique(Point2i(w, h), Point2i(1, 1), Point2i(1, 1))); + + if ( weightTensor == nullptr ) + { + Shape weightShape(ifmShape4D.Batch(), h, w, ifmShape4D.Depth()); + std::vector ones(weightShape.Elements(), 1); + auto onesBuf = std::make_shared(std::move(ones)); + weightTensor = std::make_shared(name + "_weights", DataType::UInt8, weightShape, onesBuf); + weightTensor->SetAxisOrder(AxisOrder::IHWO); + } + + if ( biasTensor == nullptr ) + { + DataType biasType; + std::shared_ptr buf; + auto elems = ifmShape4D.Depth(); + if ( ifm->Type() == DataType::Int16 ) + { + biasType = DataType::Int64; + std::vector data(ToUnsigned(elems)); + buf = std::make_shared(std::move(data)); + } + else + { + biasType = DataType::Int32; + std::vector data(ToUnsigned(elems)); + buf = std::make_shared(std::move(data)); + } + biasTensor = std::make_shared(name + "bias", biasType, Shape(ifmShape4D.Depth()), buf); + } + + auto ifmQuantZp0 = ifmQuant; + ifmQuantZp0.zeroPoints.clear(); + ifmQuantZp0.zeroPoints.push_back(0); + op->ConnectInput(TensorUsage::IFM, ifm).Set(ifmShape4D).Set(ifmQuant).Set({readOffset, readShape}); + op->ConnectInput(TensorUsage::Weights, weightTensor).Set(weightQuant); + op->ConnectInput(TensorUsage::Scales, biasTensor).Set(ifmQuantZp0); + + auto ofm = std::make_shared(name + "_intermediate", DataType::Int32); + ofm->SetStorageShape(ofmShape4D); + op->ConnectOutput(TensorUsage::OFM, ofm).Set(ofmQuant); + + return op.get(); +} + + +// Upcast to int32 
+Operation *TFLiteGraphOptimiser::CreateCastToInt32(const TensorConnection *ifmConn) +{ + assert(ifmConn->tensor->Type() != DataType::Int32); + + auto noScaleQuantZp0 = ifmConn->quantization; + noScaleQuantZp0.scales.clear(); + noScaleQuantZp0.zeroPoints.clear(); + noScaleQuantZp0.zeroPoints.push_back(0); + + auto ofmShape4D = Shape::PadAxes(ifmConn->shape, 4, 1); + auto op = std::make_shared(OpType::MemoryCopy); + op->SetRounding(RoundMode::NATURAL); + op->CopyInput(TensorUsage::IFM0, *ifmConn); + auto ofm = std::make_shared(ifmConn->tensor->Name() + "_32bit", DataType::Int32); + ofm->SetStorageShape(ofmShape4D); + op->ConnectOutput(TensorUsage::OFM, ofm).Set(noScaleQuantZp0); + return op.get(); +} + + +// Converts op to int8/uint8 LUT which is generated with the given function. +Operation *TFLiteGraphOptimiser::ConvertToLUT8(Operation *op, std::function func, const std::string &name) +{ + auto ifmConn = op->Input(TensorUsage::IFM0); + auto ofmConn = op->Output(TensorUsage::OFM); + auto ifm = ifmConn->tensor; + auto ofm = ofmConn->tensor; + + if ( (ifm->Type() != DataType::Int8 && ifm->Type() != DataType::UInt8) || ifm->Type() != ofm->Type() ) + { + return op; + } + + // Generate LUT + double ifmScale(ifmConn->quantization.scales[0].Dequantize()); + double ofmScale(ofmConn->quantization.scales[0].Dequantize()); + auto zpIn = ifmConn->quantization.zeroPoints[0]; + auto zpOut = ofmConn->quantization.zeroPoints[0]; + int qMin = ifm->Type() == DataType::Int8 ? -128 : 0; + int qMax = ifm->Type() == DataType::Int8 ? 
127 : 255; + + std::vector lut; + lut.reserve(256); + for ( int x = qMin; x <= qMax; ++x ) + { + auto xReal = ifmScale * double(x - zpIn); + auto yReal = func(xReal); + int lutVal = int(RoundAwayZero(double(zpOut) + yReal / ofmScale)); + lutVal = std::min(qMax, std::max(qMin, lutVal)); + lut.push_back(uint8_t(lutVal)); + } + auto lutTens = CreateConstTensor(name, ifmConn->tensor->Type(), std::make_shared(std::move(lut))); + // The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale), + // so even if the OFM has a different scale than the IFM, the generated OFM scale instructions + // should be the same as the IFM + auto returnOp = CreateLUT(ifmConn->tensor, lutTens, ifmConn->quantization, ifmConn->quantization, lutTens->Type(), + &ifmConn->shape, ofmConn->tensor, ifmConn->slice, ofmConn->slice); + returnOp->SetRounding(RoundMode::NATURAL); + return returnOp; +} + +// Converts op to int16 interpolating LUT which is generated with the given function. +Operation *TFLiteGraphOptimiser::ConvertToInterpolatingLUT16(Operation *op, std::function func, const std::string &name) +{ + auto ifmConn = op->Input(TensorUsage::IFM0); + auto ofmConn = op->Output(TensorUsage::OFM); + auto ifm = ifmConn->tensor; + auto ofm = ofmConn->tensor; + + if ( (ifm->Type() != DataType::Int16) || ifm->Type() != ofm->Type() ) + { + return op; + } + + double ifmScale(ifmConn->quantization.scales[0].Dequantize()); + double ofmScale(ofmConn->quantization.scales[0].Dequantize()); + auto zpIn = ifmConn->quantization.zeroPoints[0]; + auto zpOut = ofmConn->quantization.zeroPoints[0]; + double qMin = IntegerMin(DataType::Int16); + double qMax = IntegerMax(DataType::Int16); + double inputMin = ifmScale * (qMin - zpIn); + double inputMax = ifmScale * (qMax - zpIn); + double outputMin = ofmScale * (qMin - zpOut); + double outputMax = ofmScale * (qMax - zpOut); + const int steps = 512; + double step = (inputMax - inputMin) / steps; + double halfStep = step / 2.0; + double 
outputScalingInv = (qMax - qMin + 1) / (outputMax - outputMin); + + // Create 32-bit LUT represented by a 16-bit base and 16-bit slope. + auto lut = std::make_unique(512); + double prevLutResult = 0; + for ( int i = 0; i < steps; i++ ) + { + double val = func(inputMin + i * step); + double valMidpoint = func(inputMin + i * step + halfStep); + double valNext = func(inputMin + (i + 1) * step); + double sampleVal = RoundAwayZero(val * outputScalingInv); + + double midpointInterpVal = RoundAwayZero((valNext * outputScalingInv + sampleVal) / 2); + double midpointVal = RoundAwayZero(valMidpoint * outputScalingInv); + double midpointErr = midpointInterpVal - midpointVal; + double bias = RoundAwayZero(midpointErr / 2.0); + + double lutResult = std::clamp(sampleVal - bias, qMin, qMax); + + if ( i > 0 ) + { + uint32_t base = uint32_t(prevLutResult); + uint32_t slope = uint32_t(lutResult - prevLutResult); + lut[i - 1] = base + (slope << 16); + } + prevLutResult = lutResult; + } + double val = RoundAwayZero(func(inputMax) * outputScalingInv); + double lutResult = std::clamp(val, qMin, qMax); + uint32_t base = uint32_t(prevLutResult); + uint32_t slope = uint32_t(lutResult - prevLutResult); + lut[steps - 1] = base + (slope << 16); + + auto lutTens = CreateConstTensor(name, DataType::Int32, std::make_shared(std::move(lut), 512)); + // The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale), + // so even if the OFM has a different scale than the IFM, the generated OFM scale instructions + // should be the same as the IFM + auto returnOp = CreateLUT(ifmConn->tensor, lutTens, ifmConn->quantization, ifmConn->quantization, lutTens->Type(), + &ifmConn->shape, ofmConn->tensor, ifmConn->slice, ofmConn->slice); + returnOp->SetRounding(RoundMode::NATURAL); + return returnOp; +} + +Operation *TFLiteGraphOptimiser::ConvertTanhSigmoidToLUT16(Operation *const op) +{ + auto ifmConn = op->Input(TensorUsage::IFM0); + auto ofmConn = 
op->Output(TensorUsage::OFM); + auto ifm = ifmConn->tensor; + auto ofm = ofmConn->tensor; + + if ( ifm->Type() != DataType::Int16 || ifm->Type() != ofm->Type() ) + { + return op; + } + + // clang-format off + // Table of sigmoid(i/24)*65536 + static const uint16_t SIGMOID_TABLE[256] = + { + 32768, 33451, 34133, 34813, 35493, 36169, 36843, 37513, + 38180, 38841, 39498, 40149, 40794, 41432, 42064, 42688, + 43304, 43912, 44511, 45102, 45683, 46255, 46817, 47369, + 47911, 48443, 48964, 49475, 49975, 50464, 50942, 51409, + 51865, 52311, 52745, 53169, 53581, 53983, 54374, 54755, + 55125, 55485, 55834, 56174, 56503, 56823, 57133, 57433, + 57724, 58007, 58280, 58544, 58800, 59048, 59288, 59519, + 59743, 59959, 60168, 60370, 60565, 60753, 60935, 61110, + 61279, 61441, 61599, 61750, 61896, 62036, 62172, 62302, + 62428, 62549, 62666, 62778, 62886, 62990, 63090, 63186, + 63279, 63368, 63454, 63536, 63615, 63691, 63765, 63835, + 63903, 63968, 64030, 64090, 64148, 64204, 64257, 64308, + 64357, 64405, 64450, 64494, 64536, 64576, 64614, 64652, + 64687, 64721, 64754, 64786, 64816, 64845, 64873, 64900, + 64926, 64950, 64974, 64997, 65019, 65039, 65060, 65079, + 65097, 65115, 65132, 65149, 65164, 65179, 65194, 65208, + 65221, 65234, 65246, 65258, 65269, 65280, 65291, 65301, + 65310, 65319, 65328, 65337, 65345, 65352, 65360, 65367, + 65374, 65381, 65387, 65393, 65399, 65404, 65410, 65415, + 65420, 65425, 65429, 65433, 65438, 65442, 65445, 65449, + 65453, 65456, 65459, 65462, 65465, 65468, 65471, 65474, + 65476, 65479, 65481, 65483, 65485, 65488, 65489, 65491, + 65493, 65495, 65497, 65498, 65500, 65501, 65503, 65504, + 65505, 65507, 65508, 65509, 65510, 65511, 65512, 65513, + 65514, 65515, 65516, 65517, 65517, 65518, 65519, 65520, + 65520, 65521, 65522, 65522, 65523, 65523, 65524, 65524, + 65525, 65525, 65526, 65526, 65526, 65527, 65527, 65528, + 65528, 65528, 65529, 65529, 65529, 65529, 65530, 65530, + 65530, 65530, 65531, 65531, 65531, 65531, 65531, 65532, + 65532, 65532, 65532, 
65532, 65532, 65533, 65533, 65533, + 65533, 65533, 65533, 65533, 65533, 65534, 65534, 65534, + 65534, 65534, 65534, 65534, 65534, 65534, 65534, 65535 + // clang-format on + }; + + auto lut = std::make_unique(512); + for ( int i = -256; i < 256; ++i ) + { + int j0, j1, v0, v1; + if ( i >= 0 ) + { + j0 = i; + j1 = i == 255 ? 255 : i + 1; + v0 = SIGMOID_TABLE[j0] - 0x8000; + v1 = SIGMOID_TABLE[j1] - 0x8000; + } + else + { + j0 = i == -256 ? 255 : -i; + if ( op->Type() == OpType::Sigmoid ) + { + j1 = j0 - 1; + } + else + { + j1 = i == -256 ? 255 : j0 - 1; + } + + v0 = 0x8000 - SIGMOID_TABLE[j0]; + v1 = 0x8000 - SIGMOID_TABLE[j1]; + } + + uint32_t base = v0 & 0xffff; + + uint32_t slope = 0; + if ( v1 - v0 > 0 ) slope = v1 - v0; + + lut[256 + i] = (slope << 16) | (base); + } + + auto lutTens = CreateConstTensor("LUT", ifmConn->tensor->Type(), std::make_shared(std::move(lut), 512)); + op->ConnectInput(TensorUsage::LUT, lutTens); + return op; +} + + +// Rewrite functions + +// Convert EXP operations to LUT +Operation *TFLiteGraphOptimiser::ConvertExpToLUT(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + Operation *returnOp = operation; + OpType type = operation->Type(); + if ( type != OpType::Exp ) + { + return returnOp; + } + const auto &ifmConn = operation->Input(TensorUsage::IFM0); + DataType ifmType = ifmConn->tensor->Type(); + if ( (ifmType & DataType::Bits8) == DataType::Bits8 ) + { + returnOp = ConvertToLUT8( + operation, [](double x) -> double { return std::exp(x); }, "Exp"); + RecordOptimisation(operation, returnOp); + operation->Disconnect(); + } + else if ( ifmType == DataType::Int16 ) + { + returnOp = ConvertToInterpolatingLUT16( + operation, [](double x) -> double { return std::exp(x); }, "Exp16(interp)"); + RecordOptimisation(operation, returnOp); + operation->Disconnect(); + } + return returnOp; +} + +Operation *TFLiteGraphOptimiser::RewriteConcat(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + auto *returnOp = 
operation; + auto opType = operation->Type(); + + if ( opType == OpType::Concat || opType == OpType::ConcatTFLite || opType == OpType::Pack ) + { + auto *ifm0Conn = operation->Input(TensorUsage::IFM0); + auto *ofmConn = operation->Output(TensorUsage::OFM); + auto *ofm = ofmConn->tensor.get(); + auto axis = GetAxis(operation); + auto axis4D = 0; + + // Remove writers from OFM + ofm->RemoveWriters(); + + // No unfuse of activation, should have been taken care of in TFLite reader. + // Pack treated like concat after setting desired shape + Shape packShape = Shape(); // Pack/Unpack calculates shape once outside loop. + if ( opType == OpType::Pack ) + { + packShape = MakePackUnpackDesiredShape(axis, ifm0Conn->shape, &axis4D); + } + + auto idx = 0; + auto usage = MakeTensorUsage(TensorUsage::IFM, 0); + auto *ifmConn = operation->Input(usage); + auto offset = 0; + // Set shape on all IFMs + while ( ifmConn != nullptr ) + { + Shape writeOffset(0, 0, 0, 0); + if ( opType == OpType::Pack ) + { + ifmConn->shape = packShape; + writeOffset[axis4D] = offset; + } + else if ( opType == OpType::Concat || opType == OpType::ConcatTFLite ) + { + ifmConn->shape = MakeConcatSplitDesiredShape(axis, ifmConn->shape, &axis4D); + writeOffset[axis4D] = offset; + } + + auto op = MakeMemoryCopyForConcat(ofmConn, ifmConn, writeOffset); + + offset += ifmConn->shape[axis4D]; + + ifmConn->tensor->RemoveReader(operation->shared_from_this()); + + usage = MakeTensorUsage(TensorUsage::IFM, ++idx); + ifmConn = operation->Input(usage); + + RecordOptimisation(operation, op.get()); + } + // Replaced by multiple ops. + // Will return the original op, which have all the Input/Outputs for the traversal. + // But with Writers and Readers cleared. 
+ } + return returnOp; +} + + +Operation *TFLiteGraphOptimiser::RewriteSplit(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + auto *returnOp = operation; + auto opType = operation->Type(); + + if ( opType == OpType::Split || opType == OpType::SplitV || opType == OpType::StridedSlice || + opType == OpType::Slice || opType == OpType::Unpack ) + { + auto *ifmConn = operation->Input(TensorUsage::IFM0); + assert(ifmConn); + auto *ofmConn = operation->Output(TensorUsage::OFM); + assert(ofmConn); + auto axis = GetAxis(operation); + auto axis4D = 0; + + if ( opType == OpType::StridedSlice ) + { + // StridedSlice ellipsis_mask not supported. + // StridedSlice new_axis_mask and shrink_axis_mask cannot both be set. + const auto ellipsis_mask = operation->Parameters().strided_slice.ellipsis_mask; + const auto new_axis_mask = operation->Parameters().strided_slice.new_axis_mask; + const auto shrink_axis_mask = operation->Parameters().strided_slice.shrink_axis_mask; + if ( ellipsis_mask != 0 || (new_axis_mask != 0 && shrink_axis_mask != 0) ) + { + return returnOp; + } + } + + // Only rewrite for int8, uint8 and int16 supported. + auto ifmType = ifmConn->tensor->Type(); + if ( ifmType != DataType::Int8 && ifmType != DataType::UInt8 && ifmType != DataType::Int16 ) + { + return returnOp; + } + + // Only rewrite for int8, uint8 and int16 supported. + auto ofmType = ofmConn->tensor->Type(); + if ( ofmType != DataType::Int8 && ofmType != DataType::UInt8 && ofmType != DataType::Int16 ) + { + return returnOp; + } + + Shape unpackShape = Shape(); // Pack/Unpack calculates shape once outside loop. 
+ if ( opType == OpType::Unpack ) + { + unpackShape = MakePackUnpackDesiredShape(axis, ofmConn->shape, &axis4D); + } + + auto idx = 0; + auto offset = 0; + auto usage = MakeTensorUsage(TensorUsage::OFM, 0); + ofmConn = operation->Output(usage); + // Set shape on all OFMs + while ( ofmConn != nullptr ) + { + // Remove writers from OFM + auto *ofm = ofmConn->tensor.get(); + ofm->RemoveWriters(); + + Shape readOffset(0, 0, 0, 0); + Shape readShape(1, 1, 1, 1); + + if ( opType == OpType::Unpack ) + { + ofmConn->shape = unpackShape; + readShape = unpackShape; + readOffset[axis4D] = offset; + } + else if ( opType == OpType::Split || opType == OpType::SplitV ) + { + ofmConn->shape = MakeConcatSplitDesiredShape(axis, ofmConn->shape, &axis4D); + readShape = ofmConn->shape; + readOffset[axis4D] = offset; + } + else if ( opType == OpType::Slice ) + { + ofmConn->shape = Shape::PadAxes(ofmConn->shape, 4, 1); + readShape = ifmConn->shape.WithOnes(); + readOffset = ifmConn->shape.WithZeros(); + SetSliceOffsetValues(operation, readShape, readOffset); + } + else if ( opType == OpType::StridedSlice ) + { + // TODO: MLBEDSW-9071: Change StridedSlice shape to 4D + ofmConn->shape = MakeStridedSliceDesiredShape(operation, ofmConn->shape); + readShape = ifmConn->shape.WithOnes(); + readOffset = ifmConn->shape.WithZeros(); + SetStridedSliceOffsetValues(operation, ifmConn, readShape, readOffset); + } + + auto op = MakeMemoryCopyForSplitOps(ofmConn, ifmConn, readShape, readOffset); + offset += ofmConn->shape[axis4D]; + + usage = MakeTensorUsage(TensorUsage::OFM, ++idx); + ofmConn = operation->Output(usage); + RecordOptimisation(operation, op.get()); + } + // Replaced by multiple ops. + // Will return the original op, which have all the Input/Outputs for the traversal. + // But with Writers and Readers cleared. 
+        ifmConn->tensor->RemoveReader(operation->shared_from_this());
+    }
+    return returnOp;
+}
+
+
+// Bypass (remove) a Reshape/Squeeze/ExpandDims op by rewiring its producers and
+// consumers so no data movement is required. When both the IFM and the OFM must
+// be preserved (both are graph boundary tensors) a MemoryCopy is inserted first.
+// Returns the op that replaces the reshape in traversal order (the copy op if
+// one was inserted, otherwise the original op).
+Operation *TFLiteGraphOptimiser::RemoveReshape(Graph *const graph, Operation *const operation)
+{
+    Operation *returnOp = operation;
+    OpType opType = operation->Type();
+
+    if ( IsReshape(opType) )
+    {
+        auto *ifmConn = operation->Input(TensorUsage::IFM0);
+        auto *ofmConn = operation->Output(TensorUsage::OFM);
+        auto *ifm = ifmConn->tensor.get();
+        auto *ofm = ofmConn->tensor.get();
+
+        // Check if ifm/ofm are network ifm/ofm
+        bool isIfmSgIfm = IsTensorInVector(graph->Inputs(), ifm);
+        bool isOfmSgOfm = IsTensorInVector(graph->Outputs(), ofm);
+        bool isIfmSgOfm = IsTensorInVector(graph->Outputs(), ifm);
+
+        // TODO: MLBEDSW-9069: Check CPU operator producer/consumer
+
+        // Inserts a copy op if needed before removing reshapes.
+        // Needed when both sides are boundary tensors: neither tensor may be
+        // eliminated, so the data must genuinely be duplicated.
+        if ( (isIfmSgIfm || isIfmSgOfm) && (isOfmSgOfm) )
+        {
+            auto copyOp = InsertCopyOpAfterTensor(ifmConn->tensor, ifmConn->quantization);
+            copyOp->SetRounding(RoundMode::NATURAL);
+
+            // reset the ifm to reflect the reshape's new ifm
+            ifmConn = operation->Input(TensorUsage::IFM0);
+            ifm = ifmConn->tensor.get();
+            returnOp = copyOp.get();
+            RecordOptimisation(operation, returnOp);
+            // Reshape still needs to be removed.
+        }
+
+        // Remove the reshape and one of the tensors.
+        if ( isOfmSgOfm )
+        {
+            // TODO: This path should also be used for ofm tensors consumed by CPU ops.
+
+            // The OFM is in graph outputs, do not remove this tensor.
+            // Bypass by replacing ifm with ofm.
+            // Set OFM as output for IFM producers
+            ReplaceProducerOutput(ifm->Writers(), ifm, ofmConn->tensor);
+
+            // Set OFM as input to other IFM consumers.
+            ReplaceConsumerInput(operation, ifm->Readers(), ifm, ofmConn->tensor);
+        }
+        else
+        {
+            // Bypass by replacing ofm with ifm.
+            // Set IFM as input to OFM consumers.
+            ReplaceConsumerInput(nullptr, ofm->Readers(), ofm, ifmConn->tensor);
+        }
+        // Remove the reshape from ifm readers and ofm writers.
+        // Note the Inputs/Outputs on operation should still be intact to not break the traversal.
+        ifm->RemoveReader(operation->shared_from_this());
+        ofm->RemoveWriter(operation->shared_from_this());
+    }
+
+    return returnOp;
+}
+
+// Remove a Transpose op by fusing its permutation into the producer op's OFM
+// connection when the producer supports it; otherwise insert a MemoryCopy that
+// carries the transpose. Non-constant or >4-element permutations are left alone.
+// Returns the op the transpose was fused into, or the original op if unchanged.
+Operation *TFLiteGraphOptimiser::RemoveTranspose(Graph *const graph, Operation *const operation)
+{
+    Operation *returnOp = operation;
+
+    OpType opType = operation->Type();
+
+    if ( opType == OpType::Transpose )
+    {
+        auto *ifmConn = operation->Input(TensorUsage::IFM0);
+        assert(ifmConn);
+        auto *paramsConn = operation->Input(TensorUsage::Params);
+        assert(paramsConn);
+        assert(paramsConn->shape.Size() == 1);
+        auto *ofmConn = operation->Output(TensorUsage::OFM);
+        assert(ofmConn);
+        auto *ifm = ifmConn->tensor.get();
+        auto *ofm = ofmConn->tensor.get();
+        Shape ifmShape = ifmConn->shape;
+        Shape ofmShape = ofmConn->shape;
+
+        // We can only handle permutation vectors up to 4 elements
+        if ( paramsConn->shape.Depth() > 4 ) return returnOp;
+
+        // We can only handle constant permutation vectors
+        if ( !paramsConn->tensor->IsConstant() ) return returnOp;
+
+        // Convert the permutation vector to a transpose mask that can transpose a shape of size 4
+        // For example:
+        // [0, 1, 2, 3] -> 0x0123 ("NHWC")
+        // [0, 1, 2] -> 0x0123 ("NHWC")
+        // [0, 1] -> 0x0123 ("NHWC")
+        // [0] -> 0x0123 ("NHWC")
+        // [0, 2, 1, 3] -> 0x0213 ("NWHC")
+        // [1, 0, 2] -> 0x0213 ("NWHC")
+        uint32_t mask = 0;
+        int offset = 4 - paramsConn->shape.Depth();
+        // Leading (padded) axes keep their identity position in the mask
+        for ( int i = 0; i < offset; i++ )
+        {
+            mask = (mask << 4) | i;
+        }
+        // Remaining nibbles come from the permutation values, shifted by the pad offset
+        // NOTE(review): template arguments appear stripped from this patch text
+        // (e.g. View().Values()) -- verify against the original change.
+        for ( int i = offset; i < 4; i++ )
+        {
+            mask = (mask << 4) | (offset + paramsConn->tensor->View().Values()[i - offset]);
+        }
+
+        // Convert the transpose mask to a transpose type
+        TransposeType transposeType = TransposeType(mask);
+
+        // Check if IFM/OFM are network IFM/OFM
+        bool isIfmSgIfm = IsTensorInVector(graph->Inputs(), ifm);
+        bool isIfmSgOfm = IsTensorInVector(graph->Outputs(), ifm);
+        bool ifmSingleReader = ifm->Readers().size() == 1;
+        bool ifmSingleWriter = ifm->Writers().size() == 1;
+
+        Operation *mainOp = nullptr;
+
+        // Fusing into the producer is only safe when the IFM is internal and
+        // exclusively connects the producer to this transpose.
+        if ( ifmSingleReader && ifmSingleWriter && !isIfmSgIfm && !isIfmSgOfm )
+        {
+            // Get the previous op and its current transpose type
+            Operation *prevOp = ifm->Writers()[0].get();
+            OpType prevOpType = prevOp->Type();
+            auto prevOfmConn = prevOp->Output(TensorUsage::OFM);
+
+            if ( IsNone(prevOfmConn->transpose) && prevOfmConn->reverse == ReverseType::None &&
+                 prevOfmConn->shape == ifmShape && _arch->SupportsTranspose(prevOpType, transposeType) )
+            {
+                // Set transpose type and shape on main op's OFM connection
+                prevOp->Output(TensorUsage::OFM)->shape = Shape::PadAxes(ofmShape, 4, 1);
+                prevOp->Output(TensorUsage::OFM)->transpose = transposeType;
+
+                // Previous op supports transpose -- Save it so we can fuse transpose to it
+                mainOp = prevOp;
+            }
+        }
+
+        if ( !mainOp && _arch->SupportsTranspose(OpType::MemoryCopy, transposeType) )
+        {
+            // Previous doesn't support transpose -- Add a MemoryCopy so we can fuse transpose to it
+            auto memoryCopy = InsertCopyOpAfterTensor(ifmConn->tensor, ifmConn->quantization);
+            memoryCopy->SetRounding(RoundMode::NATURAL);
+
+            // Set transpose type and shapes on main op's IFM/OFM connection
+            memoryCopy->Input(TensorUsage::IFM0)->shape = Shape::PadAxes(ifmShape, 4, 1);
+            memoryCopy->Output(TensorUsage::OFM)->shape = Shape::PadAxes(ofmShape, 4, 1);
+            memoryCopy->Output(TensorUsage::OFM)->transpose = transposeType;
+
+            // Since we added a new op and tensor before our transpose, update to new IFM
+            ifmConn = operation->Input(TensorUsage::IFM0);
+            ifm = ifmConn->tensor.get();
+
+            mainOp = memoryCopy.get();
+        }
+
+        if ( mainOp )
+        {
+            // Bypass and remove Transpose
+            ReplaceProducerOutput(ifm->Writers(), ifm, ofmConn->tensor);
+            ReplaceConsumerInput(operation, ifm->Readers(), ifm, ofmConn->tensor);
+            operation->Disconnect();
+
+            returnOp = mainOp;
+        }
+    }
+
+    return returnOp;
+}
+
+
+// Remove Reverse op and move its reverse type to the previous op's
+// OFM TensorConnection. If previous op can't reverse,
+// insert a MemoryCopy.
+Operation *TFLiteGraphOptimiser::RemoveReverse(Graph *const graph, Operation *const operation)
+{
+    auto returnOp = operation;
+
+    if ( operation->Type() == OpType::ReverseV2 )
+    {
+        auto ifmConn = operation->Input(TensorUsage::IFM);
+        auto paramsConn = operation->Input(TensorUsage::Params);
+        auto ofmConn = operation->Output(TensorUsage::OFM);
+
+        auto *ifm = ifmConn->tensor.get();
+        auto *ofm = ofmConn->tensor.get();
+        Shape ifmShape = ifmConn->shape;
+        Shape ofmShape = ofmConn->shape;
+
+        // We can only handle constant axis vectors
+        if ( !paramsConn->tensor->IsConstant() ) return returnOp;
+
+        // We can only handle 1-element axis vectors
+        if ( paramsConn->shape != Shape(1) ) return returnOp;
+
+        assert(paramsConn->tensor->Type() == DataType::Int32);
+        int32_t axis = paramsConn->tensor->View().Values()[0];
+
+        // Convert the axis parameter to a reverse type.
+        // For example:
+        // [axis = 0, size = 1, min_axis = 0, max_axis = 0] -> reverse type C (0x1)
+        // [axis = 0, size = 2, min_axis = 0, max_axis = 1] -> reverse type W (0x2)
+        // [axis = 1, size = 2, min_axis = 0, max_axis = 1] -> reverse type C (0x1)
+        // [axis = 0, size = 3, min_axis = 0, max_axis = 2] -> reverse type H (0x4)
+        // [axis = 1, size = 3, min_axis = 0, max_axis = 2] -> reverse type W (0x2)
+        // [axis = 2, size = 3, min_axis = 0, max_axis = 2] -> reverse type C (0x1)
+        // [axis = 1, size = 4, min_axis = 1, max_axis = 3] -> reverse type H (0x4)
+        // [axis = 2, size = 4, min_axis = 1, max_axis = 3] -> reverse type W (0x2)
+        // [axis = 3, size = 4, min_axis = 1, max_axis = 3] -> reverse type C (0x1)
+        const int size = ifmShape.Size();
+        if ( axis < 0 ) axis = size + axis;
+        const int axis_min = std::max(size - 3, 0);  // Can only reverse the last 3 dimensions
+        const int axis_max = size - 1;
+        if ( axis < axis_min || axis > axis_max ) return returnOp;
+        const ReverseType reverseType = ReverseType(1 << (axis_max - axis));
+
+        // Check if IFM/OFM are network IFM/OFM
+        bool isIfmSgIfm = IsTensorInVector(graph->Inputs(), ifm);
+        bool isIfmSgOfm = IsTensorInVector(graph->Outputs(), ifm);
+        bool ifmSingleReader = ifm->Readers().size() == 1;
+        bool ifmSingleWriter = ifm->Writers().size() == 1;
+
+        Operation *mainOp = nullptr;
+
+        // Fuse into the producer only when the IFM is internal and exclusively
+        // connects the producer to this ReverseV2.
+        if ( ifmSingleReader && ifmSingleWriter && !isIfmSgIfm && !isIfmSgOfm )
+        {
+            // Get the previous op and its current reverse type
+            Operation *prevOp = ifm->Writers()[0].get();
+            OpType prevOpType = prevOp->Type();
+            auto prevOfmConn = prevOp->Output(TensorUsage::OFM);
+
+            if ( prevOfmConn->reverse == ReverseType::None && IsNone(prevOfmConn->transpose) &&
+                 prevOfmConn->shape == ifmShape && _arch->SupportsReverse(prevOpType, reverseType) )
+            {
+                // Set reverse type and shape on main op's OFM connection
+                prevOp->Output(TensorUsage::OFM)->shape = Shape::PadAxes(ofmShape, 4, 1);
+                prevOp->Output(TensorUsage::OFM)->reverse = reverseType;
+
+                // Previous op supports reverse -- Save it so we can fuse reverse to it
+                mainOp = prevOp;
+            }
+        }
+
+        if ( !mainOp && _arch->SupportsReverse(OpType::MemoryCopy, reverseType) )
+        {
+            // Previous doesn't support reverse -- Add a MemoryCopy so we can fuse reverse to it
+            auto memoryCopy = InsertCopyOpAfterTensor(ifmConn->tensor, ifmConn->quantization);
+            memoryCopy->SetRounding(RoundMode::NATURAL);
+
+            // Set reverse type and shapes on main op's IFM/OFM connection
+            memoryCopy->Input(TensorUsage::IFM0)->shape = Shape::PadAxes(ifmShape, 4, 1);
+            memoryCopy->Output(TensorUsage::OFM)->shape = Shape::PadAxes(ofmShape, 4, 1);
+            memoryCopy->Output(TensorUsage::OFM)->reverse = reverseType;
+
+            // Since we added a new op and tensor before our reverse, update to new IFM
+            ifmConn = operation->Input(TensorUsage::IFM0);
+            ifm = ifmConn->tensor.get();
+
+            mainOp = memoryCopy.get();
+        }
+
+        if ( mainOp )
+        {
+            // Bypass and remove Reverse
+            ReplaceProducerOutput(ifm->Writers(), ifm, ofmConn->tensor);
+            ReplaceConsumerInput(operation, ifm->Readers(), ifm, ofmConn->tensor);
+            operation->Disconnect();
+
+            returnOp = mainOp;
+        }
+    }
+
+    return returnOp;
+}
+
+// Replace TFLite GatherV2 and GatherNd with GraphIR Gather, if possible.
+// The TFLite params/indices/output tensors are reshaped onto the GraphIR
+// Gather N/W/K/C layout; int16 indices are first cast to int32.
+Operation *TFLiteGraphOptimiser::ConvertGather(Graph *const graph, Operation *const operation)
+{
+    UNUSED(graph);
+
+    Operation *returnOp = operation;
+
+    OpType opType = operation->Type();
+
+    if ( opType == OpType::GatherV2 && _arch->SupportsGather(OpType::Gather) )
+    {
+        auto *paramsConn = operation->Input(TensorUsage::IFM0);
+        auto *idxConn = operation->Input(TensorUsage::IFM1);
+        auto *ofmConn = operation->Output(TensorUsage::OFM);
+        assert(paramsConn);
+        assert(idxConn);
+        assert(ofmConn);
+
+        auto paramsRank = paramsConn->shape.Size();
+        auto idxRank = idxConn->shape.Size();
+
+        // TFLite Gather attributes
+        int axisParam = 0;
+        int batchDimsParam = 0;
+        // NOTE(review): static_cast target type appears stripped from this patch
+        // text -- verify against the original change.
+        const tflite::Operator *const passthrough = static_cast(operation->Passthrough());
+        if ( passthrough )
+        {
+            const auto options = passthrough->builtin_options_as_GatherOptions();
+            if ( options )
+            {
+                axisParam = options->axis();
+                if ( axisParam < 0 ) axisParam = paramsRank - (-axisParam);
+                batchDimsParam = options->batch_dims();
+                // TODO: convert below asserts to TFLite semantic checks
+                assert(axisParam >= 0);
+                assert(axisParam < paramsRank);
+                assert(batchDimsParam >= 0);
+                assert(batchDimsParam < paramsRank);
+                assert(batchDimsParam < idxRank);
+                assert(batchDimsParam <= axisParam);
+            }
+        }
+
+        // Calculate GraphIR Gather N dim (product of the batch dims)
+        int N = 1;
+        for ( int i = 0; i < batchDimsParam; i++ )
+        {
+            N *= paramsConn->shape[i];
+        }
+
+        // Calculate GraphIR Gather W dim (product of the non-batch index dims)
+        int W = 1;
+        for ( int i = batchDimsParam; i < idxRank; i++ )
+        {
+            W *= idxConn->shape[i];
+        }
+
+        // Calculate GraphIR Gather K dim (the gathered axis)
+        int K = paramsConn->shape[axisParam];
+
+        // Calculate GraphIR Gather C dim (dims after the gathered axis)
+        int C = 1;
+        for ( int i = axisParam + 1; i < paramsRank; i++ )
+        {
+            C *= paramsConn->shape[i];
+        }
+
+        // Calculate the remaining dims (must be 1)
+        int S = 1;
+        for ( int i = batchDimsParam; i < axisParam; i++ )
+        {
+            S *= paramsConn->shape[i];
+        }
+
+        if ( S == 1 )
+        {
+            // Rebuild shapes
+            paramsConn->shape = Shape(1, N, K, C);
+            paramsConn->tensor->SetName("values");
+            idxConn->shape = Shape(1, 1, N, W);
+            idxConn->tensor->SetName("indices");
+            ofmConn->shape = Shape(1, N, W, C);
+            ofmConn->tensor->SetName("output");
+
+            if ( idxConn->tensor->Type() == DataType::Int16 )
+            {
+                // Create new op that casts indices to int32
+                auto idxCastOp = CreateCastToInt32(idxConn);
+
+                // Use the casted indices
+                auto idxCastConn = idxCastOp->Output(TensorUsage::OFM);
+                idxCastConn->shape = Shape(1, 1, N, W);
+                idxCastConn->tensor->SetName("indices-int32");
+                operation->CopyInput(TensorUsage::IFM1, *idxCastConn);
+            }
+
+            // Replace TFLite GatherV2 with GraphIR Gather
+            auto gatherOp = std::make_shared(OpType::Gather);
+            gatherOp->SetRounding(RoundMode::DBL);
+            ReplaceOperation(operation, gatherOp.get());
+            RecordOptimisation(operation, gatherOp.get());
+
+            returnOp = gatherOp.get();
+        }
+    }
+
+    return returnOp;
+}
+
+// Replace TFLite ScatterNd with GraphIR Scatter, if possible.
+Operation *TFLiteGraphOptimiser::ConvertScatter(Graph *const graph, Operation *const operation)
+{
+    UNUSED(graph);
+
+    Operation *returnOp = operation;
+
+    OpType opType = operation->Type();
+
+    if ( opType == OpType::ScatterNd && _arch->SupportsScatter(OpType::Scatter) )
+    {
+        auto *idxConn = operation->Input(TensorUsage::IFM0);
+        auto *updatesConn = operation->Input(TensorUsage::IFM1);
+        auto *shapeConn = operation->Input(TensorUsage::Params);
+        auto *ofmConn = operation->Output(TensorUsage::OFM);
+        assert(idxConn);
+        assert(updatesConn);
+        assert(shapeConn);
+        assert(ofmConn);
+
+        // Can only support this op when last dimension is 1
+        if ( idxConn->shape[-1] != 1 )
+        {
+            return returnOp;
+        }
+
+        // TODO: MLBEDSW-8459: Add supported ops check for TFLite ScatterND
+        assert(shapeConn->tensor->IsConstant());
+        assert(shapeConn->shape.Size() == 1);
+
+        // Calculate GraphIR Scatter N dim
+        int N = 1;
+
+        // Calculate GraphIR Scatter K dim (first element of the shape tensor)
+        // NOTE(review): template arguments appear stripped from this patch text
+        // (e.g. View().Values(), std::make_shared) -- verify against the original change.
+        int K = shapeConn->tensor->View().Values()[0];
+
+        // Calculate GraphIR Scatter W dim (product of all index dims but the last)
+        int W = 1;
+        for ( int i = 0; i < idxConn->shape.Size() - 1; i++ )
+        {
+            W *= idxConn->shape[i];
+        }
+
+        // Calculate GraphIR Scatter C dim (product of the remaining shape elements)
+        int C = 1;
+        for ( int i = 1; i < shapeConn->shape.Depth(); i++ )
+        {
+            C *= shapeConn->tensor->View().Values()[i];
+        }
+
+        // Reshape tensors to follow GraphIR Scatter convention
+        idxConn->shape = Shape(1, 1, N, W);
+        idxConn->tensor->SetName("indices");
+        updatesConn->shape = Shape(1, N, W, C);
+        updatesConn->tensor->SetName("input");
+        ofmConn->shape = Shape(1, N, K, C);
+        ofmConn->tensor->SetName("values_out");
+
+        // Generate a constant zeroed tensor as the GraphIR Scatter values_in tensor with same shape as values_out
+        auto dtype = ofmConn->tensor->Type();
+        std::vector zeroVector(DataTypeStorageSizeBytes(dtype, ofmConn->shape.Elements()), 0);
+        auto zeroBuffer = std::make_shared(std::move(zeroVector));
+        auto zeroTensor = CreateConstTensor("values_in", dtype, zeroBuffer, &ofmConn->shape);
+
+        // Add GraphIR Scatter op
+        auto scatterOp = std::make_shared(OpType::Scatter);
+        scatterOp->SetRounding(RoundMode::NATURAL);
+        scatterOp->ConnectInput(TensorUsage::IFM0, zeroTensor);  // GraphIR Scatter values_in
+        scatterOp->CopyInput(TensorUsage::IFM1, *idxConn);       // GraphIR Scatter indices
+        scatterOp->CopyInput(TensorUsage::IFM2, *updatesConn);   // GraphIR Scatter input
+        scatterOp->CopyOutput(TensorUsage::OFM, *ofmConn);       // GraphIR Scatter values_out
+
+        // Remove TFLite ScatterNd op
+        operation->Disconnect();
+        RecordOptimisation(operation, scatterOp.get());
+
+        returnOp = scatterOp.get();
+    }
+
+    return returnOp;
+}
+
+// Replace TFLite ResizeBilinear or ResizeNearestNeighbor with Resize
+// when the architecture supports the resulting scale/offset combination.
+// Scale factors are expressed as rational numbers (numerator/denominator)
+// reduced with gcd, with align-corners and half-pixel-centers adjustments
+// applied before the support query.
+Operation *TFLiteGraphOptimiser::ConvertResize(Graph *const graph, Operation *const operation)
+{
+    UNUSED(graph);
+    Operation *returnOp = operation;
+    OpType opType = operation->Type();
+
+    if ( opType == OpType::ResizeBilinear || opType == OpType::ResizeNearestNeighbor )
+    {
+        auto ifmConn = operation->Input(TensorUsage::IFM);
+        auto ofmConn = operation->Output(TensorUsage::OFM);
+        assert(ifmConn);
+        assert(ofmConn);
+
+        // Get numerators(n) and denominators(d) for the scale fractions
+        int width_n = ofmConn->shape.Width();
+        int width_d = ifmConn->shape.Width();
+        int height_n = ofmConn->shape.Height();
+        int height_d = ifmConn->shape.Height();
+        int heightOffset = 0;
+        int widthOffset = 0;
+
+        // Compute scaling fractions
+        // align-corners use a scale-factor of (n-1)/(d-1)
+        if ( operation->Parameters().resize.alignCorners )
+        {
+            if ( width_d > 1 )
+            {
+                width_n -= 1;
+                width_d -= 1;
+            }
+            if ( height_d > 1 )
+            {
+                height_n -= 1;
+                height_d -= 1;
+            }
+        }
+
+        // reduce scaling fractions with gcd
+        int gcd_w = std::gcd(width_n, width_d);
+        width_n = (width_n / gcd_w);
+        width_d = (width_d / gcd_w);
+
+        int gcd_h = std::gcd(height_n, height_d);
+        height_n = (height_n / gcd_h);
+        height_d = (height_d / gcd_h);
+
+        if ( operation->Parameters().resize.halfPixelCenters )
+        {
+            // make sure fractions are evenly divisible by 2
+            width_n = width_n * 2;
+            width_d = width_d * 2;
+            height_n = height_n * 2;
+            height_d = height_d * 2;
+            // adjust offset for half-pixel-centers
+            widthOffset = (width_d / 2) - (width_n / 2);
+            heightOffset = (height_d / 2) - (height_n / 2);
+        }
+
+        // set up op-support query
+        ResizeSupportQuery query;
+        query.scaleX = {int16_t(width_n), int16_t(width_d)};
+        query.scaleY = {int16_t(height_n), int16_t(height_d)};
+        query.offsetX = widthOffset;
+        query.offsetY = heightOffset;
+        query.ifmShape = ifmConn->shape;
+
+        if ( opType == OpType::ResizeBilinear )
+        {
+            query.mode = ArchResizeMode::Bilinear;
+        }
+        else
+        {
+            query.mode = ArchResizeMode::Nearest;
+        }
+        if ( _arch->SupportsResize(query) )
+        {
+            // Replace ResizeBilinear or ResizeNearestNeighbor with a Resize op
+            auto resizeOp = std::make_shared(OpType::Resize);
+            resizeOp->SetRounding(RoundMode::SYMMETRIC);
+            resizeOp->CopyInput(TensorUsage::IFM, *ifmConn);
+            resizeOp->CopyOutput(TensorUsage::OFM, *ofmConn);
+            resizeOp->Parameters() = operation->Parameters();
+
+            // write operator attributes
+            auto &attr = resizeOp->attr;
+            attr.resize.scaleX = {int16_t(width_n), int16_t(width_d)};
+            attr.resize.scaleY = {int16_t(height_n), int16_t(height_d)};
+            attr.resize.offsetYX[0] = heightOffset;
+            attr.resize.offsetYX[1] = widthOffset;
+            attr.resize.borderYX[0] = 0;
+            attr.resize.borderYX[1] = 0;
+            attr.resize.mode = tosa::ResizeMode::NEAREST;
+            if ( opType == OpType::ResizeBilinear )
+            {
+                attr.resize.mode = tosa::ResizeMode::BILINEAR;
+            }
+
+            int shift = 0;
+            if ( opType == OpType::ResizeBilinear && (ifmConn->shape.Width() > 1 || ifmConn->shape.Height() > 1) )
+            {
+                // ResizeBilinear is post-scaled with
+                // 1 / (height_n * width_n)
+                // as the scale-factor is a power of two, we can use shift
+                shift = IntLog2(width_n * height_n);
+            }
+
+            // Set explicit scaling
+            Quantization ofmQuant = ofmConn->quantization;
+            ofmQuant.scales.clear();
+            ofmQuant.zeroPoints.clear();
+            ofmQuant.scales.emplace_back(QuantizedScale(1, shift));
+            ofmQuant.zeroPoints.emplace_back(0);
+            ofmQuant.type = QuantizationType::EXPLICIT;
+            resizeOp->Output(TensorUsage::OFM)->Set(ofmQuant);
+
+            RecordOptimisation(operation, resizeOp.get());
+            returnOp = resizeOp.get();
+            operation->Disconnect();
+        }
+    }
+    return returnOp;
+}
+
+// Rewrite ArgMax into a kernel-based reduction along H or W: the reduced axis
+// is mapped into the kernel size, the OFM is given an explicit Q15 scale
+// (shift 16), and IFMs taller than 64K rows are split into kMaxSize slices.
+Operation *TFLiteGraphOptimiser::ConvertArgMax(Graph *const graph, Operation *const operation)
+{
+    UNUSED(graph);
+    Operation *returnOp = operation;
+    if ( operation->Type() == OpType::ArgMax && _arch->SupportsArgMax(OpType::ArgMax) )
+    {
+        if ( operation->Input(TensorUsage::IFM0)->slice.shape )
+        {
+            // Already processed, return
+            return returnOp;
+        }
+        constexpr int kMaxSize = 65536;
+        auto *ifmConn = operation->Input(TensorUsage::IFM0);
+        auto *paramsConn = operation->Input(TensorUsage::Params);
+        auto *ofmConn = operation->Output(TensorUsage::OFM);
+
+        auto axis = paramsConn->tensor->View().Values()[0];
+        axis = (axis == -1) ? ifmConn->shape.Size() - 1 : axis;
+        // Normalise the axis to a 4D (NHWC-padded) index
+        auto axis_4D = axis + (4 - ifmConn->shape.Size());
+
+        int ifmHeight = ifmConn->shape.Size() >= 3 ? ifmConn->shape.Height() : 1;
+        int ifmWidth = ifmConn->shape.Size() >= 2 ? ifmConn->shape.Width() : 1;
+
+        if ( axis_4D == 3 )
+        {
+            // ArgMax on Z-dimension, reshape to 1x(H*W)xCx1
+            ifmHeight = ifmHeight * ifmWidth;
+            ifmWidth = ifmConn->shape.Depth() > 0 ? ifmConn->shape.Depth() : 1;
+            ifmConn->shape = Shape(1, ifmHeight, ifmWidth, 1);
+            ofmConn->shape = Shape(1, ifmHeight, 1, 1);
+            axis_4D = 2;
+        }
+        operation->attr.axis.axis = axis_4D;
+        // Kernel spans the reduced axis so the whole axis is visited per output
+        int kernelH = axis_4D == 1 ? ifmHeight : 1;
+        int kernelW = axis_4D == 2 ? ifmWidth : 1;
+        std::unique_ptr kernel = std::make_unique(Point2i(kernelW, kernelH), Point2i(1, 1), Point2i(1, 1));
+        operation->SetKernel(std::move(kernel));
+        ofmConn->quantization.scales.clear();
+        ofmConn->quantization.scales.push_back(QuantizedScale(1, 16));
+        ofmConn->quantization.zeroPoints.clear();
+        ofmConn->quantization.zeroPoints.push_back(0);
+        operation->SetRounding(RoundMode::TRUNCATE_TO_LOWER);
+        if ( ifmConn->shape.Size() != ofmConn->shape.Size() )
+        {
+            // One dimension has been removed, reinsert a one in the removed axis dimension
+            ofmConn->shape.Insert(axis_4D, 1);
+        }
+
+        if ( ifmHeight > kMaxSize )
+        {
+            // Create splits of kMaxSize until not enough height left
+            TensorSlice ifmSlice = {Shape(0, 0, 0, 0), Shape(1, kMaxSize, ifmWidth, 1)};
+            ifmConn->Set(ifmSlice);
+            TensorSlice ofmSlice = {Shape(0, 0, 0, 0), Shape(1, kMaxSize, 1, 1)};
+            ofmConn->Set(ofmSlice);
+
+            int remainingHeight = ifmHeight - kMaxSize;
+            int offset = kMaxSize;
+            while ( remainingHeight > 0 )
+            {
+                int splitHeight = std::min(remainingHeight, kMaxSize);
+                remainingHeight -= splitHeight;
+
+                auto argmaxSplit = std::make_shared(OpType::ArgMax);
+                argmaxSplit->ConnectInput(TensorUsage::Params, paramsConn->tensor);
+
+                // Set slices to the size of each split
+                argmaxSplit->ConnectInput(TensorUsage::IFM0, ifmConn->tensor);
+                auto *splitIfmConn = argmaxSplit->Input(TensorUsage::IFM0);
+                ifmSlice = {Shape(0, offset, 0, 0), Shape(1, splitHeight, ifmWidth, 1)};
+                splitIfmConn->Set(ifmSlice);
+                splitIfmConn->shape = Shape(1, ifmHeight, ifmWidth, 1);
+
+                argmaxSplit->ConnectOutput(TensorUsage::OFM, ofmConn->tensor);
+                auto *splitOfmConn = argmaxSplit->Output(TensorUsage::OFM);
+                ofmSlice = {Shape(0, offset, 0, 0), Shape(1, splitHeight, 1, 1)};
+                splitOfmConn->Set(ofmSlice);
+
+                splitOfmConn->shape = Shape(1, ifmHeight, 1, 1);
+                splitOfmConn->quantization.scales.push_back(QuantizedScale(1, 16));
+                splitOfmConn->quantization.zeroPoints.push_back(0);
+                argmaxSplit->SetRounding(RoundMode::TRUNCATE_TO_LOWER);
+                argmaxSplit->attr.axis.axis = axis_4D;
+                argmaxSplit->SetKernel(std::make_unique(Point2i(kernelW, kernelH), Point2i(1, 1), Point2i(1, 1)));
+                offset += splitHeight;
+                RecordOptimisation(operation, argmaxSplit.get());
+            }
+        }
+    }
+    return returnOp;
+}
+
+// If a MemoryCopy reads a slice (split/strided input), try to push the slice
+// down onto its single consumer so the copy can later be removed.
+Operation *TFLiteGraphOptimiser::MoveSplitSliceToConsumer(Graph *const, Operation *const operation)
+{
+    auto *ifmConn = operation->Input(TensorUsage::IFM0);
+
+    if ( operation->Type() == OpType::MemoryCopy && ifmConn->slice.offset.Size() > 0 )
+    {
+        auto *ofmConn = operation->Output(TensorUsage::OFM);
+        auto *ofm = ofmConn->tensor.get();
+
+        // TODO: MLBEDSW-9072: Add check that moving split to consumer is valid
+
+        // We can only move to consumer if there is no transpose on the op that we will remove,
+        // otherwise we will lose that transposition.
+        if ( ofm->Readers().size() == 1 && IsNone(ofmConn->transpose) )
+        {
+            auto cons = ofm->Readers().front();
+            auto consOfmConn = cons->Output(TensorUsage::OFM);
+            auto *consIfm0 = cons->IFM(0);
+            auto *consIfm1 = cons->IFM(1);
+
+            bool ifmShapeEqual = false;
+            if ( consIfm0 == ofm )
+            {
+                // Check if ifm0 consumer has correct shape
+                auto *consIfm0Conn = cons->Input(TensorUsage::IFM0);
+                ifmShapeEqual = consIfm0Conn->shape == ofmConn->shape;
+            }
+            else if ( consIfm1 != nullptr && consIfm1 == ofm )
+            {
+                // Check if ifm1 consumer has correct shape
+                auto *consIfm1Conn = cons->Input(TensorUsage::IFM1);
+                ifmShapeEqual = consIfm1Conn->shape == ofmConn->shape;
+            }
+
+            // We can only move to consumer if there is no transpose on the op that we move to,
+            // otherwise the IFM shape may change and transposition will be wrong.
+            if ( !IsReshape(cons->Type()) && ofmConn->shape == Shape::PadAxes(ofm->StorageShape(), 4, 1) &&
+                 IsNone(consOfmConn->transpose) && ifmShapeEqual )
+            {
+                // Split/Slice can be performed by tensor consumer
+                MoveToConsumer(operation, cons.get());
+            }
+        }
+    }
+
+    return operation;
+}
+
+// Build a Transpose op that swaps the W and C axes of the given IFM (fixed
+// permutation [0,1,3,2]), producing a new OFM tensor with the given shape.
+// Used by RewriteBatchMatMul to honour the adj_x/adj_y attributes.
+// NOTE(review): returning op.get() from a local shared_ptr relies on
+// ConnectInput/ConnectOutput keeping the Operation alive via back-references
+// from the connected tensors -- verify against the Operation lifetime model.
+Operation *TFLiteGraphOptimiser::CreateTransposeForMatMul(const std::shared_ptr &ifm, const Shape &ofmShape)
+{
+    auto op = std::make_shared(OpType::Transpose);
+
+    int32_t permutation[] = {0, 1, 3, 2};
+    auto buf = std::make_shared(4, std::move(permutation), false);
+
+    // IFM should have the untransposed shape
+    op->ConnectInput(TensorUsage::IFM, ifm).Set(Shape(1, ofmShape.Height(), ofmShape.Depth(), ofmShape.Width()));
+    op->ConnectInput(TensorUsage::Params, std::make_shared("perm", DataType::Int32, Shape(4), buf));
+
+    auto ofm = std::make_shared(ifm->Name() + "/" + OpTypeToString(op->Type()), ifm->Type());
+    ofm->SetStorageShape(ofmShape);
+
+    op->ConnectOutput(TensorUsage::OFM, ofm);
+    return op.get();
+}
+
+// Decompose a TFLite BatchMatMul into n = N*H single-batch MatMul ops that
+// read/write height-1 slices of the reshaped operands. Transposes implied by
+// adj_x/adj_y are realised with explicit Transpose ops first.
+Operation *TFLiteGraphOptimiser::RewriteBatchMatMul(Graph *const, Operation *const operation)
+{
+    Operation *returnOp = operation;
+    if ( operation->Type() == OpType::BatchMatMul && _arch->SupportsMatMul(OpType::MatMul) )
+    {
+        const auto ifm = operation->Input(TensorUsage::IFM0);
+        const auto ifm2 = operation->Input(TensorUsage::IFM1);
+        const auto ofm = operation->Output(TensorUsage::OFM);
+
+        bool transposeIfm = false;
+        bool transposeIfm2 = false;
+        const tflite::Operator *const passthrough = static_cast(operation->Passthrough());
+        if ( passthrough )
+        {
+            const auto options = passthrough->builtin_options_as_BatchMatMulOptions();
+            if ( options )
+            {
+                // adj_x = True then ifm should be transposed
+                transposeIfm = options->adj_x();
+                // adj_y = False then ifm2 should be transposed
+                transposeIfm2 = !options->adj_y();
+            }
+        }
+
+        auto ofmShape = Shape::PadAxes(ofm->shape, 4, 1);
+        auto ifmShape = Shape::PadAxes(ifm->shape, 4, 1);
+        auto ifm2Shape = Shape::PadAxes(ifm2->shape, 4, 1);
+
+        int n = ofmShape.Batch() * ofmShape.Height();
+
+        // IFM handling - Reshape ifm N,H,W,C -> 1,NxH,W,C
+        auto ifmReshaped = Shape(1, n, ifmShape.Width(), ifmShape.Depth());
+        auto ifmTensor = ifm->tensor;
+        if ( transposeIfm )
+        {
+            // Add Transpose op, ifm: 1,n,W,C -> 1,n,C,W
+            ifmReshaped = Shape(1, ifmReshaped.Height(), ifmReshaped.Depth(), ifmReshaped.Width());
+            auto op = CreateTransposeForMatMul(ifm->tensor, ifmReshaped);
+            RecordOptimisation(operation, op);
+            ifmTensor = op->Output(TensorUsage::OFM)->tensor;
+        }
+
+        // IFM2 handling - Reshape ifm2 N,H,W,C -> 1,NxH,W,C
+        auto ifm2Reshaped = Shape(1, n, ifm2Shape.Width(), ifm2Shape.Depth());
+        auto ifm2Tensor = ifm2->tensor;
+        if ( transposeIfm2 )
+        {
+            // Add Transpose op, ifm2: 1,n,W,C -> 1,n,C,W
+            ifm2Reshaped = Shape(1, ifm2Reshaped.Height(), ifm2Reshaped.Depth(), ifm2Reshaped.Width());
+            auto op = CreateTransposeForMatMul(ifm2->tensor, ifm2Reshaped);
+            RecordOptimisation(operation, op);
+            ifm2Tensor = op->Output(TensorUsage::OFM)->tensor;
+        }
+
+        // OFM handling
+        // Reshape ofm N,H,W,C -> 1,NxH,W,C
+        auto ofmReshaped = Shape(1, n, ofmShape.Width(), ofmShape.Depth());
+
+        // Add n Matmul ops, one per batch slice
+        for ( int i = 0; i < n; ++i )
+        {
+            auto newOp = std::make_shared(OpType::MatMul);
+            newOp->SetRounding(ifm->tensor->Type() == DataType::Int16 ? RoundMode::NATURAL : RoundMode::DBL);
+
+            TensorSlice ifmSlice = {Shape(0, i, 0, 0), ifmReshaped.WithHeight(1)};
+            newOp->ConnectInput(TensorUsage::IFM0, ifmTensor).Set(ifmReshaped).Set(ifm->quantization).Set(ifmSlice);
+
+            TensorSlice ifm2Slice = {Shape(0, i, 0, 0), ifm2Reshaped.WithHeight(1)};
+            newOp->ConnectInput(TensorUsage::IFM1, ifm2Tensor).Set(ifm2Reshaped).Set(ifm2->quantization).Set(ifm2Slice);
+
+
+            TensorSlice ofmSlice = {Shape(0, i, 0, 0), ofmReshaped.WithHeight(1)};
+            newOp->ConnectOutput(TensorUsage::OFM, ofm->tensor).Set(ofmReshaped).Set(ofm->quantization).Set(ofmSlice);
+
+            RecordOptimisation(operation, newOp.get());
+            returnOp = newOp.get();
+        }
+
+        operation->Disconnect();
+    }
+    return returnOp;
+}
+
+
+// Rewrite a FullyConnected with non-constant weights as a MatMul, since the
+// weight encoder requires constant weights.
+Operation *TFLiteGraphOptimiser::RewriteFullyConnectDynamic(Graph *const, Operation *const operation)
+{
+    Operation *returnOp = operation;
+    auto ifm2 = operation->Input(TensorUsage::Weights);
+    if ( operation->Type() == OpType::FullyConnected && !ifm2->tensor->IsConstant() && _arch->SupportsMatMul(OpType::MatMul) )
+    {
+        const auto ifm = operation->Input(TensorUsage::IFM0);
+        const auto ofm = operation->Output(TensorUsage::OFM);
+
+        auto ofmShape = Shape::PadAxes(ofm->shape, 4, 1);
+        auto ifmShape = Shape::PadAxes(ifm->shape, 4, 1);
+        auto ifm2Shape = Shape::PadAxes(ifm2->shape, 4, 1);
+
+        auto matMulOp = std::make_shared(OpType::MatMul);
+        matMulOp->SetRounding(ifm->tensor->Type() == DataType::Int16 ? RoundMode::NATURAL : RoundMode::DBL);
+
+        matMulOp->ConnectInput(TensorUsage::IFM0, ifm->tensor).Set(ifmShape).Set(ifm->quantization).Set(ifm->slice).Set(ifm->transpose);
+        matMulOp->ConnectInput(TensorUsage::IFM1, ifm2->tensor).Set(ifm2Shape).Set(ifm2->quantization).Set(ifm2->slice).Set(ifm2->transpose);
+        matMulOp->ConnectOutput(TensorUsage::OFM, ofm->tensor).Set(ofmShape).Set(ofm->quantization).Set(ofm->slice).Set(ofm->transpose);
+
+        RecordOptimisation(operation, matMulOp.get());
+        returnOp = matMulOp.get();
+
+        operation->Disconnect();
+    }
+    return returnOp;
+}
+
+
+// Decompose SquaredDifference into cast/mul/sub/mul ops using the same
+// fixed-point arithmetic as the TFLite reference kernel: both inputs are
+// cast to int32, pre-scaled by 2*max(inputScale), subtracted, squared,
+// then post-scaled to the OFM quantization.
+Operation *TFLiteGraphOptimiser::RewriteSquaredDifference(Graph *const, Operation *const operation)
+{
+    Operation *returnOp = operation;
+    if ( operation->Type() == OpType::SquaredDifference )
+    {
+        const auto ifmConn = operation->Input(TensorUsage::IFM0);
+        const auto ifm2Conn = operation->Input(TensorUsage::IFM1);
+        const auto ofmConn = operation->Output(TensorUsage::OFM);
+
+        const double ifmScale = ifmConn->quantization.scales[0].Dequantize();
+        const double ifm2Scale = ifm2Conn->quantization.scales[0].Dequantize();
+        const double ofmScale = ofmConn->quantization.scales[0].Dequantize();
+
+        auto oneScaleQuant = ifmConn->quantization;
+        oneScaleQuant.scales[0] = {1, 0};
+        oneScaleQuant.zeroPoints.clear();
+
+        auto noScaleQuant = ifmConn->quantization;
+        noScaleQuant.scales.clear();
+        noScaleQuant.zeroPoints.clear();
+
+        // All the calculations same as reference kernel
+        const double twiceMaxInputScale = 2.0 * std::max(ifmScale, ifm2Scale);
+        const double realInput1Multiplier = ifmScale / twiceMaxInputScale;
+        const double realInput2Multiplier = ifm2Scale / twiceMaxInputScale;
+
+        // int16 path uses no headroom shift; int8/uint8 get 7 bits of headroom
+        int leftShift = ifmConn->tensor->Type() == DataType::Int16 ? 0 : 7;
+
+        double realOutputMultiplier = (twiceMaxInputScale * twiceMaxInputScale) / ((1 << (leftShift * 2)) * ofmScale);
+
+        auto quantizedRealInput1 = QuantizedScale(realInput1Multiplier);
+        auto quantizedRealInput2 = QuantizedScale(realInput2Multiplier);
+        auto quantizedRealOutput = QuantizedScale(realOutputMultiplier);
+        quantizedRealInput1.scale = std::max(quantizedRealInput1.scale, 1);
+        quantizedRealInput2.scale = std::max(quantizedRealInput2.scale, 1);
+        quantizedRealOutput.scale = std::max(quantizedRealOutput.scale, 1);
+
+        auto input1MultiplierConst = CreateConstTensor(
+            ifmConn->tensor->Name() + "_input1_multiplier", quantizedRealInput1.scale);
+        auto input2MultiplierConst = CreateConstTensor(
+            ifm2Conn->tensor->Name() + "_input2_multiplier", quantizedRealInput2.scale);
+        auto outputMultiplierConst = CreateConstTensor(
+            ofmConn->tensor->Name() + "_output_multiplier", quantizedRealOutput.scale);
+
+        // Convert ifm to 32 bit
+        auto castOp = CreateCastToInt32(ifmConn);
+        // Use explicit scaling (multiplier) for the left shift
+        castOp->Output(TensorUsage::OFM)->quantization.scales.clear();
+        castOp->Output(TensorUsage::OFM)->quantization.scales.push_back(QuantizedScale(1 << leftShift, 0));
+        castOp->Output(TensorUsage::OFM)->quantization.type = QuantizationType::EXPLICIT;
+        // NOTE(review): unlike the ifm2 cast below, this cast op is not passed
+        // to RecordOptimisation -- confirm whether that is intentional.
+
+        // Scale/shift ifm (for 32-bit operations, scale is not applied but shift is)
+        auto mulOp = CreateMul(castOp->Output(TensorUsage::OFM)->tensor, input1MultiplierConst, noScaleQuant, noScaleQuant, noScaleQuant);
+        mulOp->SetRounding(RoundMode::DBL);
+        mulOp->Output(TensorUsage::OFM)->quantization.scales.clear();
+        mulOp->Output(TensorUsage::OFM)->quantization.scales.push_back(QuantizedScale(1, quantizedRealInput1.shift));
+        mulOp->Output(TensorUsage::OFM)->quantization.type = QuantizationType::EXPLICIT;
+        auto ifmScaled = mulOp->Output(TensorUsage::OFM);
+        RecordOptimisation(operation, mulOp);
+
+        // Convert ifm2 to 32 bit
+        castOp = CreateCastToInt32(ifm2Conn);
+        // Use explicit scaling (multiplier) for the left shift
+        castOp->Output(TensorUsage::OFM)->quantization.scales.clear();
+        castOp->Output(TensorUsage::OFM)->quantization.scales.push_back(QuantizedScale(1 << leftShift, 0));
+        castOp->Output(TensorUsage::OFM)->quantization.type = QuantizationType::EXPLICIT;
+        RecordOptimisation(operation, castOp);
+
+        // Scale/shift ifm2 (for 32-bit operations, scale is not applied but shift is)
+        mulOp = CreateMul(castOp->Output(TensorUsage::OFM)->tensor, input2MultiplierConst, noScaleQuant, noScaleQuant, noScaleQuant);
+        mulOp->SetRounding(RoundMode::DBL);
+        mulOp->Output(TensorUsage::OFM)->quantization.scales.clear();
+        mulOp->Output(TensorUsage::OFM)->quantization.scales.push_back(QuantizedScale(1, quantizedRealInput2.shift));
+        mulOp->Output(TensorUsage::OFM)->quantization.type = QuantizationType::EXPLICIT;
+        auto ifm2Scaled = mulOp->Output(TensorUsage::OFM);
+        RecordOptimisation(operation, mulOp);
+
+        // Calculate the raw diff
+        auto subOp = CreateSub(ifmScaled->tensor, ifm2Scaled->tensor, noScaleQuant, noScaleQuant, noScaleQuant);
+        subOp->SetRounding(RoundMode::DBL);
+        auto rawDiff = subOp->Output(TensorUsage::OFM);
+        RecordOptimisation(operation, subOp);
+
+        // Calculate the squared diff
+        mulOp = CreateMul(rawDiff->tensor, rawDiff->tensor, noScaleQuant, noScaleQuant, noScaleQuant);
+        mulOp->SetRounding(RoundMode::DBL);
+        auto squaredRaw = mulOp->Output(TensorUsage::OFM);
+        RecordOptimisation(operation, mulOp);
+
+        // Scale/shift ofm (for 32-bit operations, scale is not applied but shift is)
+        returnOp = CreateMul(squaredRaw->tensor, outputMultiplierConst, noScaleQuant, noScaleQuant, ofmConn->quantization);
+        returnOp->SetRounding(RoundMode::DBL);
+        returnOp->ConnectOutput(TensorUsage::OFM, ofmConn->tensor);
+        returnOp->Output(TensorUsage::OFM)->quantization.scales.clear();
+        returnOp->Output(TensorUsage::OFM)->quantization.scales.push_back(QuantizedScale(1, quantizedRealOutput.shift));
+        returnOp->Output(TensorUsage::OFM)->quantization.type = QuantizationType::EXPLICIT;
+        RecordOptimisation(operation, returnOp);
+
+        operation->Disconnect();
+    }
+    return returnOp;
+}
+
+
+// Fold a SpaceToBatchND -> (Depthwise)Conv2D -> BatchToSpaceND chain into a
+// single dilated convolution: the block shape becomes the kernel dilation and
+// the SpaceToBatch/BatchToSpace ops are disconnected.
+Operation *TFLiteGraphOptimiser::RewriteSpaceToBatchConvBatchToSpace(Graph *const, Operation *const operation)
+{
+    auto opType = operation->Type();
+    if ( opType == OpType::DepthwiseConv2DBias || opType == OpType::Conv2DBias )
+    {
+        auto prevOp = operation->IFM(0)->Writers().empty() ? nullptr : operation->IFM(0)->Writers().front().get();
+        auto nextOp = operation->OFM()->Readers().empty() ? nullptr : operation->OFM()->Readers().front().get();
+        if ( prevOp && prevOp->Type() == OpType::SpaceToBatchND &&  // Previous op is SpaceToBatchND
+             nextOp && nextOp->Type() == OpType::BatchToSpaceND &&  // Next op is BatchToSpaceND
+             operation->IFM(0)->Readers().size() == 1 &&            // No other consumers of SpaceToBatchND output
+             operation->OFM()->Readers().size() == 1                // No other consumers of BatchToSpaceND input
+        )
+        {
+            // Go ahead and short-circuit the SpaceToBatchND and BatchToSpaceND ops
+            operation->ConnectInput(TensorUsage::IFM0, prevOp->Input(TensorUsage::IFM0)->tensor);
+            operation->ConnectOutput(TensorUsage::OFM, nextOp->Output(TensorUsage::OFM)->tensor);
+            // Set new kernel dilation
+            auto blockShape = prevOp->Input(TensorUsage::Params);
+            int count = blockShape->shape[0];
+            assert(count == operation->IFM(0)->StorageShape().Size() - 2);
+            assert(blockShape->tensor->IsConstant());
+            auto values = blockShape->tensor->View().Values();
+            // 1-element block shapes apply the same factor to both axes
+            Point2i dilation(values[0], values[count > 1 ? 1 : 0]);
+            Kernel dilatedKernel = operation->Kernel()->WithDilation(std::move(dilation));
+            // Calculate padding for new kernel
+            Point2i dilatedWH = dilatedKernel.DilatedWH();
+            auto &stride = dilatedKernel.Stride();
+            auto &inputShape = operation->IFM(0)->StorageShape();
+            int xpad = NeededTotalPadding(inputShape.Width(), stride.x, dilatedWH.x);
+            int ypad = NeededTotalPadding(inputShape.Height(), stride.y, dilatedWH.y);
+            Margin pad = Margin(ypad / 2, xpad / 2, (ypad + 1) / 2, (xpad + 1) / 2);
+            // Set the new kernel with updated dilation and padding
+            operation->SetKernel(std::make_unique(dilatedKernel.WithPadding(pad)));
+            // Disconnect the SpaceToBatchND and BatchToSpaceND ops
+            prevOp->Disconnect();
+            nextOp->Disconnect();
+        }
+    }
+    return operation;
+}
+
+// Fixup Conv2DBias and DepthwiseConv2DBias to allow dilation greater than 2.
+// Hardware dilation covers factors of 2; any remaining factor is realised by
+// expanding the constant weights into a larger, sparse (zero-padded) kernel.
+// TODO: Replace with kernel decomposition for supported architectures
+Operation *TFLiteGraphOptimiser::FixupDilationGT2(Graph *const, Operation *const operation)
+{
+    auto returnOp = operation;
+    if ( operation->Type() == OpType::Conv2DBias || operation->Type() == OpType::DepthwiseConv2DBias )
+    {
+        auto dilation = operation->Kernel()->Dilation();
+        // If dilation in either axis is greater than that supported by hardware then we must manually dilate the kernel
+        if ( dilation.x > 2 || dilation.y > 2 )
+        {
+            // If the dilation is a multiple of 2 then the hardware dilation can be enabled to provide that multiple
+            // of 2. This allows the kernel size to be reduced (via the scaled dilation) by half in that dimension.
+            int hwDilationH = (dilation.y % 2 == 0) ? 2 : 1;
+            int hwDilationW = (dilation.x % 2 == 0) ? 2 : 1;
+            // NOTE(review): "maunalDilationW" is a misspelling of manualDilationW
+            // (used consistently below, so behaviour is unaffected).
+            int manualDilationH = dilation.y / hwDilationH;
+            int maunalDilationW = dilation.x / hwDilationW;
+
+            auto *weightConn = operation->Input(TensorUsage::Weights);
+            assert(weightConn);
+            assert(weightConn->tensor->IsConstant());
+            auto weights = weightConn->tensor->View().Values();
+            const auto &weightShape = weightConn->shape;
+
+            // Create new empty kernel with dilated size
+            auto origKernelSize = operation->Kernel()->Size();
+            auto dilatedKernelSize = operation->Kernel()->WithDilation({manualDilationH, maunalDilationW}).DilatedWH();
+            Kernel dilatedKernel = operation->Kernel()->WithDilation({hwDilationH, hwDilationW}).WithSize(dilatedKernelSize);
+            const int newKernelBufferSize = dilatedKernel.ElementsWH() * weightShape.Depth();
+            operation->SetKernel(std::make_unique(std::move(dilatedKernel)));
+
+            // Copy the original kernel values into the new sparse kernel
+            // Width and depth stride same for original and new kernel
+            auto strideC = 1;
+            auto strideW = weightShape.Depth();
+            auto origStrideH = strideW * origKernelSize.x;
+            auto newStrideH = strideW * dilatedKernelSize.x;
+            auto newKernelVals = std::make_unique(newKernelBufferSize);
+            for ( int h = 0; h < origKernelSize.y; ++h )
+            {
+                for ( int w = 0; w < origKernelSize.x; ++w )
+                {
+                    for ( int c = 0; c < weightShape.Depth(); c++ )
+                    {
+                        auto origKernelIdx = c * strideC + w * strideW + h * origStrideH;
+                        auto newKernelIdx = c * strideC + w * strideW * maunalDilationW + h * newStrideH * manualDilationH;
+                        newKernelVals[newKernelIdx] = weights[origKernelIdx];
+                    }
+                }
+            }
+            weightConn->tensor->SetBuffer(std::make_shared(std::move(newKernelVals), newKernelBufferSize));
+            Shape newShape = weightShape.WithHW(dilatedKernelSize.y, dilatedKernelSize.x);
+            weightConn->tensor->SetStorageShape(newShape);
+            weightConn->Set(newShape);
+        }
+    }
+    return returnOp;
+}
+
+// If conv op without bias tensor, create one with zeroes
+Operation *TFLiteGraphOptimiser::FixupBias(Graph *const, Operation *const operation)
+{
+    if
( IsConvolution(operation->Type()) && operation->CountInputs(TensorUsage::Scales) == 0 ) + { + auto ifmConn = operation->Input(TensorUsage::IFM); + auto ofmConn = operation->Output(TensorUsage::OFM); + + // Create bias tensor with zeroes + DataType biasType; + std::shared_ptr biasBuffer; + auto biasElements = ofmConn->shape.Depth(); + if ( ifmConn->tensor->Type() == DataType::Int16 ) + { + biasType = DataType::Int64; + biasBuffer = std::make_shared(std::make_unique(biasElements), biasElements); + } + else + { + biasType = DataType::Int32; + biasBuffer = std::make_shared(std::make_unique(biasElements), biasElements); + } + auto biasTensor = CreateConstTensor("bias", biasType, biasBuffer); + operation->ConnectInput(TensorUsage::Scales, biasTensor); + } + return operation; +} + +// Convert depthwise convolutions with a depth multiplier greater than 1 into a single Conv2D if: +// - the input depth is 1; and +// - the output depth equals the depth multiplier. +Operation *TFLiteGraphOptimiser::RewriteDepthwise(Graph *const, Operation *const operation) +{ + Operation *returnOp = operation; + if ( operation->Type() == OpType::DepthwiseConv2DBias ) + { + const auto ifm = operation->Input(TensorUsage::IFM0); + const auto ofm = operation->Output(TensorUsage::OFM); + const auto multiplier = operation->Kernel()->DepthMultiplier(); + + if ( ifm && (ifm->shape.Depth() == 1) && (multiplier != 1) && ofm && (ofm->shape.Depth() == multiplier) ) + { + auto newOp = std::make_shared(OpType::Conv2DBias); + newOp->SetRounding(ifm->tensor->Type() == DataType::Int16 ? 
RoundMode::NATURAL : RoundMode::DBL); + auto kernel = std::make_unique(operation->Kernel()->Size(), operation->Kernel()->Stride(), + operation->Kernel()->Dilation(), 1, operation->Kernel()->Padding()); + newOp->SetKernel(std::move(kernel)); + ReplaceOperation(operation, newOp.get()); + returnOp = newOp.get(); + RecordOptimisation(operation, returnOp); + } + } + return returnOp; +} + + +// Check that no reshape like operations remain in graph. +Operation *TFLiteGraphOptimiser::CheckReshapeOpsRemoved(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + OpType opType = operation->Type(); + if ( IsReshape(opType) ) + { + LOG_ERROR("Reshape-like operation type {0} expected to have been removed, still remains.\n", OpTypeToString(opType)); + assert(false); + } + return operation; +} + +Operation *TFLiteGraphOptimiser::ConvertSoftmaxOps(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + return _softmax->ConvertOp(operation); +} + +Operation *TFLiteGraphOptimiser::RewriteFullyConnectedInput(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + if ( operation->Type() == OpType::FullyConnected ) + { + auto weights = operation->Input(TensorUsage::Weights); + assert(weights != nullptr); + auto nInElems = weights->shape.Depth(); + auto ifm = operation->Input(TensorUsage::IFM0); + auto &ifmShape = ifm->slice.shape.IsEmpty() ? 
ifm->shape : ifm->slice.shape; + auto elms = ifmShape.Elements(); + auto batchSize = elms / nInElems; + assert(batchSize * nInElems == elms); + ifmShape = Shape(batchSize, 1, 1, nInElems); + } + return operation; +} + +// Must be called after RewriteFullyConnectedInput +Operation *TFLiteGraphOptimiser::ConvertBatchedFullyConnected(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + auto returnOp = operation; + if ( operation->Type() == OpType::FullyConnected ) + { + auto ifm = operation->Input(TensorUsage::IFM0); + // Check if the first dimension indicates batching + auto &ifmShape = ifm->slice.shape.IsEmpty() ? ifm->shape : ifm->slice.shape; + int n = ifmShape.Batch(); + if ( n > 1 ) + { + assert(ifmShape.Height() == 1 && ifmShape.Width() == 1); + int h = 1; + int w = n; + // More square H/W gives better performance up to a point + for ( int x = 2; x <= 16 && x * x <= n; ++x ) + { + if ( n % x == 0 ) + { + h = x; + w = n / x; + } + } + ifmShape = Shape(1, h, w, ifmShape.Depth()); + auto ofm = operation->Output(TensorUsage::OFM); + ofm->shape = Shape(1, h, w, ofm->shape.Depth()); + if ( h > 4 || w > 4 ) + { + // Ended up with shape that requires the weights to be reread. + // Convert op to conv2d since this enables weight buffering. + auto newOp = std::make_shared(OpType::Conv2DBias); + newOp->SetRounding(ifm->tensor->Type() == DataType::Int16 ? 
RoundMode::NATURAL : RoundMode::DBL); + ReplaceOperation(operation, newOp.get()); + returnOp = newOp.get(); + RecordOptimisation(operation, returnOp); + } + } + } + return returnOp; +} + +Operation *TFLiteGraphOptimiser::UnrollConv(Graph *const, Operation *const operation) +{ + auto returnOp = operation; + + if ( operation->Type() == OpType::Conv2D || operation->Type() == OpType::Conv2DBias ) + { + const auto ifmConn = operation->Input(TensorUsage::IFM); + assert(ifmConn); + const auto weightsConn = operation->Input(TensorUsage::Weights); + assert(weightsConn); + const auto scalesConn = operation->Input(TensorUsage::Scales); + assert(scalesConn); + const auto ofmConn = operation->Output(TensorUsage::OFM); + assert(ofmConn); + + const auto kernel = operation->Kernel(); + assert(kernel); + const int32_t kernel_h = kernel->Size().y; + assert(kernel_h > 0); + const int32_t kernel_w = kernel->Size().x; + assert(kernel_w > 0); + const int32_t stride_h = kernel->Stride().y; + assert(stride_h > 0); + const int32_t stride_w = kernel->Stride().x; + assert(stride_w > 0); + const int32_t dilation_h = kernel->Dilation().y; + assert(dilation_h > 0); + const int32_t dilation_w = kernel->Dilation().x; + assert(dilation_w > 0); + const bool hasPadding = !kernel->Padding().IsZero(); + const bool hasIfmSlice = ifmConn->slice.shape.IsValid() || ifmConn->slice.offset.IsValid(); + const bool hasOfmSlice = ofmConn->slice.shape.IsValid() || ofmConn->slice.offset.IsValid(); + + tflite::Padding paddingType = tflite::Padding::VALID; + const tflite::Operator *const passthrough = static_cast(operation->Passthrough()); + if ( passthrough ) + { + const auto options = passthrough->builtin_options_as_Conv2DOptions(); + if ( options ) + { + paddingType = options->padding(); + } + } + + // Figure out if op needs to be unrolled + const bool needUnrollH = stride_h > 3; + const bool needUnrollW = stride_w > 3; + + // Figure out if op can be unrolled + const bool canUnroll = !hasPadding && !hasIfmSlice 
&& !hasOfmSlice && paddingType == tflite::Padding::VALID; + const bool canUnrollH = dilation_h == 1 && canUnroll; + const bool canUnrollW = dilation_w == 1 && canUnroll; + + if ( (needUnrollH || needUnrollW) && canUnrollH && canUnrollW ) + { + const Shape inputGridCell = ifmConn->shape.WithHeight(kernel_h).WithWidth(kernel_w); + const Shape outputGridCell = ofmConn->shape.WithHeight(1).WithWidth(1); + const Point2i gridSize = ofmConn->shape.WH(); + + for ( int h = 0; h < gridSize.y; h++ ) + { + for ( int w = 0; w < gridSize.x; w++ ) + { + TensorSlice ifmSlice; + ifmSlice.shape = inputGridCell; + ifmSlice.offset = Shape(0, h * stride_h, w * stride_w, 0); + + TensorSlice ofmSlice; + ofmSlice.shape = outputGridCell; + ofmSlice.offset = Shape(0, h, w, 0); + + // Add new for this grid cell + auto op = std::make_shared(operation->Type()); + op->SetKernel(std::make_unique(kernel->WithStride({1, 1}))); + op->CopyInput(TensorUsage::IFM, *ifmConn); + op->Input(TensorUsage::IFM)->Set(ifmSlice); + op->CopyInput(TensorUsage::Weights, *weightsConn); + op->CopyInput(TensorUsage::Scales, *scalesConn); + op->CopyOutput(TensorUsage::OFM, *ofmConn); + op->Output(TensorUsage::OFM)->Set(ofmSlice); + RecordOptimisation(operation, op.get()); + + returnOp = op.get(); + } + } + + // Remove original op + operation->Disconnect(); + } + } + + return returnOp; +} + +static bool MeanOpSupported(Operation *const operation, Shape &reduceAxis, Shape &ifmShape4D) +{ + auto ifmConn = operation->Input(TensorUsage::IFM0); + auto ifm = ifmConn->tensor; + auto axis = operation->Input(TensorUsage::Params)->tensor; + auto axisValues = axis->View().Values(); + auto axisCount = axis->StorageShape().IsEmpty() ? 
1 : axis->StorageShape().Depth(); + auto ifmDims = ifmShape4D.Size(); + + // Max kernel size + static constexpr int MAX_MEAN_KERNEL_SIZE = 64 * 64; + // Max size to avoid overflow INT32 + static constexpr int MAX_MEAN_ELEMENTS_INT8 = 2 << 23; // 2²⁴ x 2⁷ = 2³¹ + static constexpr int MAX_MEAN_ELEMENTS_UINT8 = 2 << 22; // 2²³ x 2⁸ = 2³¹ + static constexpr int MAX_MEAN_ELEMENTS_INT16 = 2 << 15; // 2¹⁶ x 2¹⁵ = 2³¹ + + bool supported = false; + + // Compute total number of elements + int elements = 1; + for ( int i = 0; i < ifmDims; ++i ) + { + elements *= reduceAxis[i] ? ifmShape4D[i] : 1; + } + + // Make sure overflow can not occur + switch ( ifm->Type() ) + { + case DataType::Int8: + supported = elements <= MAX_MEAN_ELEMENTS_INT8; + break; + + case DataType::UInt8: + supported = elements <= MAX_MEAN_ELEMENTS_UINT8; + break; + + case DataType::Int16: + supported = elements <= MAX_MEAN_ELEMENTS_INT16; + break; + + default: + supported = false; + break; + } + + // Only support batch 1 + supported = supported && (ifmShape4D.Batch() == 1); + + // Reduced axis must be no greater than MAX_MEAN_KERNEL_SIZE + supported = supported && (reduceAxis.Depth() * ifmShape4D.Depth() <= MAX_MEAN_KERNEL_SIZE); + supported = supported && (reduceAxis.Width() * ifmShape4D.Width() <= MAX_MEAN_KERNEL_SIZE); + supported = supported && (reduceAxis.Height() * ifmShape4D.Height() <= MAX_MEAN_KERNEL_SIZE); + + // Depth is supported if any of h,w,c == 1 + if ( supported && reduceAxis.Depth() ) + { + supported = false; + for ( int i = 1; i < 4; i++ ) + { + if ( ifmShape4D[i] == 1 ) + { + supported = true; + break; + } + } + } + return supported; +} + +Operation *TFLiteGraphOptimiser::ConvertMeanOps(Graph *const, Operation *const operation) +{ + auto returnOp = operation; + if ( operation->Type() == OpType::Mean ) + { + auto ifmConn = operation->Input(TensorUsage::IFM0); + auto ofmConn = operation->Output(TensorUsage::OFM); + auto axis = operation->Input(TensorUsage::Params)->tensor; + auto 
axisValues = axis->View().Values(); + auto axisCount = axis->StorageShape().IsEmpty() ? 1 : axis->StorageShape().Depth(); + auto &ifmShape = ifmConn->shape; + auto &ofmShape = ofmConn->shape; + auto ifmDims = ifmShape.Size(); + auto ofmDims = ofmShape.Size(); + auto &ifmQuant = ifmConn->quantization; + auto &ofmQuant = ofmConn->quantization; + static constexpr int MAX_MEAN_HEIGHT = 64; + static constexpr int MAX_MEAN_KERNEL_SIZE = 64 * 64; + + // Create a 4D shape to indicate which axis that will be reduced + Shape reduceAxis = ifmShape.WithZeros(); + for ( int i = 0; i < axisCount; ++i ) + { + reduceAxis[axisValues[i]] = 1; + } + reduceAxis = Shape::PadAxes(reduceAxis, 4, 0); + + Shape ifmShape4D = Shape::PadAxes(ifmShape, 4, 1); + + // Check if it is possible to convert the MEAN + if ( !MeanOpSupported(operation, reduceAxis, ifmShape4D) ) + { + return operation; + } + + // Fix intermediateShape when keep_dims is false + // e.g. IFM=1xHxWxC axis=2 OFM=1xHxC, the intermediateShape should be 1xHx1xC + Shape intermediateShape = ofmConn->shape; + if ( ofmDims < ifmDims ) + { + for ( int i = 0; i < ifmDims; i++ ) + { + if ( reduceAxis[i] ) + { + intermediateShape = intermediateShape.Insert(i, 1); + } + } + } + intermediateShape = Shape::PadAxes(intermediateShape, 4, 1); + + // Support mean over depth-axis by left-shifting the C channel + // From operator checks we can assume that one of H,W,C has shape 1 + if ( reduceAxis.Depth() && ifmShape4D.Depth() > 1 ) + { + // If W=1 reshape NxHx1xC -> NxHxCx1, else reshape Nx1xWxC -> NxWxCx1 + int idxToDelete = ifmShape.Width() == 1 ? 
2 : 1; + + // Delete axis with size 1 + reduceAxis = reduceAxis.Erase(idxToDelete); + ifmShape4D = ifmShape4D.Erase(idxToDelete); + intermediateShape = intermediateShape.Erase(idxToDelete); + + // Add another element to set channel-axis to one + reduceAxis = reduceAxis.Insert(3, 0); + ifmShape4D = ifmShape4D.Insert(3, 1); + intermediateShape = intermediateShape.Insert(3, 1); + } + + // Compute kernel sizes for our convolutions + int h = reduceAxis.Height() ? ifmShape4D.Height() : 1; + int w = reduceAxis.Width() ? ifmShape4D.Width() : 1; + + assert(CheckSafeMul(w, h)); + int num_elements_in_axis = h * w; + + // If one convolution is enough, but height is greater than max kernel height + // reshape from HxW to 1x(HxW) + // This can only be done if the mean is computed over both H and W + if ( h > MAX_MEAN_HEIGHT && num_elements_in_axis <= MAX_MEAN_KERNEL_SIZE && reduceAxis.Height() && reduceAxis.Width() ) + { + ifmShape4D = Shape(ifmShape4D.Batch(), 1, h * w, ifmShape4D.Depth()); + w = h * w; + h = 1; + } + + // When h x w <= 4096 When h x w > 4096 there is a need to split into several ops. + // Do this by splitting up h and change the read_offset/shape. 
+ // Below is an example where ifm is 1x190x64x1 + // MEAN MEAN + // | |-----------------------|----------------------| + // DepthwiseConv2DBias 1_DepthwiseConv2DBias 2_DepthwiseConv2DBias 3_DepthwiseConv2DBias + // | | | | + // MUL |---------ADD-----------| | + // | | + // |----------------ADD---------------| + // | + // MUL + // 1_DepthwiseConv2DBias: read_offset [0, 0, 0, 0]> read_shape [1, 64, 64, 1]> + // 2_DepthwiseConv2DBias: read_offset [0, 64, 0, 0]> read_shape [1, 64, 64, 1]> + // 3_DepthwiseConv2DBias: read_offset [0, 128, 0, 0]> read_shape [1, 62, 64, 1]> + + + int heightPerConv = std::min(MAX_MEAN_KERNEL_SIZE / w, h); + heightPerConv = std::min(heightPerConv, MAX_MEAN_HEIGHT); + int opCount = (h + heightPerConv - 1) / heightPerConv; + Quantization oneScaleQuant = ifmConn->quantization; + oneScaleQuant.scales.clear(); + oneScaleQuant.scales.push_back({1, 0}); + Quantization oneScaleQuantZp0 = oneScaleQuant; + oneScaleQuantZp0.zeroPoints.clear(); + oneScaleQuantZp0.zeroPoints.push_back(0); + + std::shared_ptr accTensor = nullptr; + + // Reuse weight tensor if more ops are needed + std::shared_ptr weightTensor = nullptr; + std::shared_ptr biasTensor = nullptr; + + // set weight quantization + Quantization weightQuant = ifmConn->quantization; + weightQuant.quantMin = {0}; + weightQuant.quantMax = {255}; + weightQuant.scales.clear(); + weightQuant.zeroPoints.clear(); + weightQuant.scales.push_back({1, 0}); + weightQuant.zeroPoints.push_back(0); + + for ( int i = 0; i < opCount; ++i ) + { + bool isLastOp = (i == (opCount - 1)); + + // Compute height for the kernel + int kh = heightPerConv; + if ( isLastOp && h % heightPerConv != 0 ) + { + kh = h % heightPerConv; + // New kernel shape so new weight tensor is needed + weightTensor = nullptr; + biasTensor = nullptr; + } + + // Calculate read and offset shape + int readShapeH = reduceAxis.Height() ? kh : ifmShape4D.Height(); + int readShapeW = reduceAxis.Width() ? 
w : ifmShape4D.Width(); + + Shape readOffset(0, i * heightPerConv, 0, 0); + Shape readShape = ifmShape4D.WithHW(readShapeH, readShapeW); + + auto op = MakeDepthwiseMeanOp(ifmConn, ifmShape4D, readShape, readOffset, intermediateShape, w, kh, + ofmConn->tensor->Name(), weightTensor, biasTensor, oneScaleQuant, weightQuant, oneScaleQuantZp0); + RecordOptimisation(operation, op); + + if ( i > 0 ) + { + // Add result to accumulator tensor + Quantization accQuant = op->Output(TensorUsage::OFM)->quantization; + op = CreateAdd(accTensor, op->Output(TensorUsage::OFM)->tensor, oneScaleQuantZp0, oneScaleQuantZp0, oneScaleQuantZp0); + op->SetRounding(RoundMode::DBL); + op->Output(TensorUsage::OFM)->quantization.scales.clear(); + op->Output(TensorUsage::OFM)->quantization.scales.push_back(QuantizedScale(1, 0)); + op->Output(TensorUsage::OFM)->quantization.type = QuantizationType::EXPLICIT; + RecordOptimisation(operation, op); + } + accTensor = op->Output(TensorUsage::OFM)->tensor; + } + QuantizedScale quant(ifmQuant.scales[0].Dequantize() / ofmQuant.scales[0].Dequantize()); + + // Convert to left shift-positive notation + auto outputShift = 31 - quant.shift; + + // Below calculation same as in reference to avoid any risk of overflow, + // clamping the shift value at the price of some precision loss. 
+ // IntLog2 same as 63 - CountLeadingZeros(num_elements_in_axis) + int shift = IntLog2(num_elements_in_axis); + shift = std::min(shift, 32); + shift = std::min(shift, 31 + outputShift); + // Multiplier should be 32bit + int32_t outputMultiplier = int32_t((int64_t(quant.scale) << shift) / num_elements_in_axis); + + // Convert to right-shift + outputShift = 31 - (outputShift - shift); + + // For int32 scaling is not supported so instead multiply with the scale + auto scalar = CreateConstTensor(ofmConn->tensor->Name() + "_scalar", outputMultiplier); + auto op = CreateMul(accTensor, scalar, oneScaleQuantZp0, oneScaleQuantZp0, oneScaleQuantZp0); + op->SetRounding(RoundMode::DBL); + + // Apply the shift + QuantizedScale scale(1, outputShift); + Quantization outQuant = ofmConn->quantization; + outQuant.scales.clear(); + outQuant.scales.push_back({1, outputShift}); + outQuant.type = QuantizationType::EXPLICIT; + op->ConnectOutput(TensorUsage::OFM, ofmConn->tensor).Set(intermediateShape).Set(outQuant); + RecordOptimisation(operation, op); + operation->Disconnect(); + returnOp = op; + } + + return returnOp; +} + +// Converts int8/uint8 Sigmoid and Tanh to a LUT based solution +Operation *TFLiteGraphOptimiser::ConvertTanhSigmoidToLUT(Graph *const, Operation *const operation) +{ + auto returnOp = operation; + auto opType = operation->Type(); + auto ifmConn = operation->Input(TensorUsage::IFM0); + auto ifm = ifmConn->tensor.get(); + + if ( ifm->Type() == DataType::Int16 && (opType == OpType::Sigmoid || opType == OpType::Tanh) ) + { + if ( _arch->SupportsSigmoidTanhLutInt16(opType) ) + { + returnOp = ConvertTanhSigmoidToLUT16(operation); + } + } + else if ( opType == OpType::Sigmoid ) + { + returnOp = ConvertToLUT8( + operation, [](double x) -> double { return ClampSigmoid8(x); }, "sigmoid"); + } + else if ( opType == OpType::Tanh ) + { + returnOp = ConvertToLUT8( + operation, [](double x) -> double { return std::tanh(x); }, "tanh"); + } + + + if ( operation != returnOp ) + { + 
RecordOptimisation(operation, returnOp); + operation->Disconnect(); + } + + return returnOp; +} + + +Operation *TFLiteGraphOptimiser::ConvertPrelu(Graph *const graph, Operation *const operation) +{ + // Lowering of PReLU + // To Minimum + Mul + ReLU + Add + // + // x>0 x <= 0 + // ReLU Minimum(x, 0) + // \ / + // \ Mul(alpha) + // \ / + // Add + // + // ReLU is used for positive input values + // Minimum(x,0) + Mul(alpha) is used for negative input values + // Add sums the two cases + UNUSED(graph); + auto returnOp = operation; + auto opType = operation->Type(); + const auto ifmConn = operation->Input(TensorUsage::IFM0); + const auto params = operation->Input(TensorUsage::Params); + const auto ofmConn = operation->Output(TensorUsage::OFM); + + if ( opType == OpType::Prelu && ifmConn && ofmConn && params ) + { + Quantization ofmQuant = ofmConn->quantization; + Quantization ifmQuant = ifmConn->quantization; + Quantization alphaQuant = params->quantization; + + Quantization noScaleQuant = Quantization::Unit(); + noScaleQuant.scales.clear(); + noScaleQuant.zeroPoints.clear(); + + Quantization unitQuantOfmZp = Quantization::Unit(); + unitQuantOfmZp.zeroPoints.clear(); + unitQuantOfmZp.zeroPoints.push_back(ofmQuant.zeroPoints[0]); + unitQuantOfmZp.type = QuantizationType::EXPLICIT; + + std::shared_ptr zeroTens; + if ( ifmConn->tensor->Type() == DataType::Int16 ) + { + zeroTens = CreateConstTensor("zero_const", int16_t(0)); + } + else + { + zeroTens = CreateConstTensor("zero_const", int8_t(0)); + } + std::shared_ptr fmNegative = ifmConn->tensor->Clone(); + std::shared_ptr fmAlpha = ofmConn->tensor->Clone(); + std::shared_ptr fmScaled = ofmConn->tensor->Clone(); + + // Select values < 0 + auto minOp = std::make_shared(OpType::Minimum); + minOp->CopyInput(TensorUsage::IFM0, *ifmConn); + minOp->ConnectInput(TensorUsage::IFM1, zeroTens).Set(noScaleQuant); + minOp->ConnectOutput(TensorUsage::OFM, fmNegative).Set(ifmConn->quantization); + minOp->SetRounding(RoundMode::DBL); + 
RecordOptimisation(operation, minOp.get()); + + // and multiply with alpha tensor + auto mulAlpha = std::make_shared(OpType::Mul); + mulAlpha->CopyInput(TensorUsage::IFM0, *minOp->Output(TensorUsage::OFM)); + mulAlpha->CopyInput(TensorUsage::IFM1, *params); + mulAlpha->ConnectOutput(TensorUsage::OFM, fmAlpha).Set(ofmConn->quantization); + mulAlpha->SetRounding(RoundMode::DBL); + RecordOptimisation(operation, mulAlpha.get()); + + // Select (and scale) values > 0 + auto reluOp = std::make_shared(OpType::Relu); + reluOp->CopyInput(TensorUsage::IFM0, *ifmConn); + reluOp->ConnectOutput(TensorUsage::OFM, fmScaled).Set(ofmConn->quantization); + reluOp->Output(TensorUsage::OFM)->quantization.quantMin.push_back(ofmConn->quantization.zeroPoints[0]); + reluOp->SetRounding(RoundMode::DBL); + RecordOptimisation(operation, reluOp.get()); + + // Add scaled and alpha multiplied values + auto addOp = std::make_shared(OpType::Add); + addOp->ConnectInput(TensorUsage::IFM0, fmAlpha).Set(unitQuantOfmZp); + addOp->ConnectInput(TensorUsage::IFM1, fmScaled).Set(unitQuantOfmZp); + addOp->CopyOutput(TensorUsage::OFM, *ofmConn); + addOp->Output(TensorUsage::OFM)->Set(unitQuantOfmZp); + addOp->SetRounding(RoundMode::DBL); + RecordOptimisation(operation, addOp.get()); + returnOp = addOp.get(); + operation->Disconnect(); + } + return returnOp; +} + +// Converts LeakyReLU +// +// alpha == 0 +// converted to ReLU +// alpha == -1 +// converted to Abs +// 8-bit LeakyReLU +// converted to a LUT if unsupported by arch +// 16-bit LeakyReLU: +// alpha > 1 +// Converted to Mul + (Mul) + Min if unsupported by arch +// The extra Mul is needed if ifmQuant != ofmQuant +// alpha <= 1 +// Converted to Mul + (Mul) + Max if unsupported by arch +Operation *TFLiteGraphOptimiser::ConvertLeakyRelu(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + auto returnOp = operation; + auto opType = operation->Type(); + auto ifmConn = operation->Input(TensorUsage::IFM0); + auto ofmConn = 
operation->Output(TensorUsage::OFM); + + // TODO MLBEDSW-8770: Investigate performance of leakyReLU optimisations + if ( opType == OpType::LeakyRelu && ifmConn != nullptr && ofmConn != nullptr ) + { + float alpha = operation->Parameters().leaky_relu.alpha; + auto ifm = ifmConn->tensor.get(); + auto ofm = ofmConn->tensor.get(); + bool quantized = !IsScalingValidAndEqual(*ifmConn, *ofmConn); + if ( alpha == 0 || std::isinf(1 / alpha) ) + { + // alpha == 0 can be converted to ReLU + auto reluOp = MakeOperation(OpType::Relu, ifmConn, nullptr, ofmConn); + reluOp->Output(TensorUsage::OFM)->quantization.quantMin.push_back(ofmConn->quantization.zeroPoints[0]); + RecordOptimisation(operation, reluOp); + returnOp = reluOp; + } + else if ( alpha == -1 ) + { + // alpha == -1 can be converted to Abs + auto absOp = MakeOperation(OpType::Abs, ifmConn, nullptr, ofmConn); + RecordOptimisation(operation, absOp); + returnOp = absOp; + } + else if ( alpha < 0 || !_arch->SupportsLeakyRelu(quantized, ifm->Type()) ) + { + if ( (ifm->Type() == DataType::Int8 || ifm->Type() == DataType::UInt8) ) + { + // convert to 8-bit LUT + assert(ifm->Type() == ofm->Type()); + returnOp = ConvertToLUT8( + operation, [&alpha](double x) -> double { return x < 0 ? (alpha * x) : x; }, "LeakyReLU"); + RecordOptimisation(operation, returnOp); + } + else + { + // Use 16-bit lowering to Mul + Max or Mul + Min + returnOp = ConvertLeakyRelu16bit(*ifmConn, *ofmConn, operation); + } + } + } + + if ( operation != returnOp ) + { + operation->Disconnect(); + } + + return returnOp; +} + +// Converts RSqrt to a LUT based solution. +Operation *TFLiteGraphOptimiser::ConvertRSqrtToLUT(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + auto returnOp = operation; + auto opType = operation->Type(); + auto ifmConn = operation->Input(TensorUsage::IFM0); + auto ofmConn = operation->Output(TensorUsage::OFM); + + // LUT has been generated by printing the output from the reference. 
+ // clang-format off + static const int32_t kRSqrtLut[] = + { + 0x00000000, 0x00100000, 0x000b504e, 0x00093cd4, 0x00080000, 0x000727c9, 0x0006882f, 0x00060c24, + 0x0005a827, 0x00055555, 0x00050f45, 0x0004d2fe, 0x00049e6a, 0x00047007, 0x000446b4, 0x00042195, + 0x00040000, 0x0003e16d, 0x0003c570, 0x0003abb0, 0x000393e5, 0x00037dd2, 0x00036945, 0x00035613, + 0x00034418, 0x00033333, 0x0003234b, 0x00031447, 0x00030612, 0x0002f89c, 0x0002ebd3, 0x0002dfaa, + 0x0002d414, 0x0002c906, 0x0002be75, 0x0002b45a, 0x0002aaab, 0x0002a161, 0x00029875, 0x00028fe3, + 0x000287a2, 0x00027fb0, 0x00027807, 0x000270a2, 0x0002697f, 0x00026298, 0x00025bec, 0x00025577, + 0x00024f35, 0x00024925, 0x00024343, 0x00023d8e, 0x00023803, 0x000232a1, 0x00022d65, 0x0002284e, + 0x0002235a, 0x00021e87, 0x000219d5, 0x00021541, 0x000210cb, 0x00020c70, 0x00020831, 0x0002040c, + 0x00020000, 0x0001fc0c, 0x0001f82f, 0x0001f468, 0x0001f0b7, 0x0001ed1a, 0x0001e991, 0x0001e61b, + 0x0001e2b8, 0x0001df67, 0x0001dc26, 0x0001d8f7, 0x0001d5d8, 0x0001d2c8, 0x0001cfc8, 0x0001ccd6, + 0x0001c9f2, 0x0001c71c, 0x0001c454, 0x0001c198, 0x0001bee9, 0x0001bc46, 0x0001b9af, 0x0001b723, + 0x0001b4a3, 0x0001b22d, 0x0001afc2, 0x0001ad61, 0x0001ab0a, 0x0001a8bc, 0x0001a678, 0x0001a43e, + 0x0001a20c, 0x00019fe3, 0x00019dc2, 0x00019baa, 0x0001999a, 0x00019791, 0x00019590, 0x00019397, + 0x000191a5, 0x00018fbb, 0x00018dd7, 0x00018bfa, 0x00018a23, 0x00018853, 0x0001868a, 0x000184c6, + 0x00018309, 0x00018152, 0x00017fa0, 0x00017df4, 0x00017c4e, 0x00017aad, 0x00017911, 0x0001777b, + 0x000175e9, 0x0001745d, 0x000172d6, 0x00017153, 0x00016fd5, 0x00016e5b, 0x00016ce7, 0x00016b76, + 0x00016a0a, 0x000168a2, 0x0001673e, 0x000165de, 0x00016483, 0x0001632b, 0x000161d7, 0x00016087, + 0x00015f3b, 0x00015df2, 0x00015cad, 0x00015b6b, 0x00015a2d, 0x000158f2, 0x000157bb, 0x00015686, + 0x00015555, 0x00015427, 0x000152fd, 0x000151d5, 0x000150b0, 0x00014f8f, 0x00014e70, 0x00014d54, + 0x00014c3b, 0x00014b24, 0x00014a11, 0x00014900, 0x000147f1, 0x000146e5, 
0x000145dc, 0x000144d5, + 0x000143d1, 0x000142cf, 0x000141d0, 0x000140d3, 0x00013fd8, 0x00013ee0, 0x00013de9, 0x00013cf5, + 0x00013c03, 0x00013b14, 0x00013a26, 0x0001393b, 0x00013851, 0x0001376a, 0x00013684, 0x000135a1, + 0x000134bf, 0x000133e0, 0x00013302, 0x00013226, 0x0001314c, 0x00013074, 0x00012f9e, 0x00012ec9, + 0x00012df6, 0x00012d25, 0x00012c55, 0x00012b87, 0x00012abb, 0x000129f1, 0x00012928, 0x00012860, + 0x0001279a, 0x000126d6, 0x00012613, 0x00012552, 0x00012492, 0x000123d4, 0x00012317, 0x0001225c, + 0x000121a2, 0x000120e9, 0x00012032, 0x00011f7c, 0x00011ec7, 0x00011e14, 0x00011d62, 0x00011cb1, + 0x00011c02, 0x00011b54, 0x00011aa7, 0x000119fb, 0x00011950, 0x000118a7, 0x000117ff, 0x00011758, + 0x000116b3, 0x0001160e, 0x0001156b, 0x000114c8, 0x00011427, 0x00011387, 0x000112e8, 0x0001124a, + 0x000111ad, 0x00011111, 0x00011076, 0x00010fdc, 0x00010f44, 0x00010eac, 0x00010e15, 0x00010d7f, + 0x00010cea, 0x00010c56, 0x00010bc4, 0x00010b32, 0x00010aa0, 0x00010a10, 0x00010981, 0x000108f3, + 0x00010865, 0x000107d9, 0x0001074d, 0x000106c2, 0x00010638, 0x000105af, 0x00010527, 0x0001049f, + 0x00010419, 0x00010393, 0x0001030e, 0x0001028a, 0x00010206, 0x00010183, 0x00010102, 0x00010080 + }; + // clang-format on + + if ( opType == OpType::Rsqrt && ifmConn->tensor->Type() == DataType::Int8 && ofmConn->tensor->Type() == DataType::Int8 ) + { + const int kShift = 20; + const int qMin = -128; + const int qMax = 127; + const auto zpIn = ifmConn->quantization.zeroPoints[0]; + const auto zpOut = ofmConn->quantization.zeroPoints[0]; + const auto ifmScale = ifmConn->quantization.scales[0].Dequantize(); + const auto ofmScale = ofmConn->quantization.scales[0].Dequantize(); + double scale = 1.0 / double(std::sqrt(float(ifmScale)) * float(ofmScale)); + QuantizedScale qScale = QuantizedScale(scale); + // convert to left shift-positive notation + qScale.shift = 31 - qScale.shift - kShift; + + std::vector lut; + lut.reserve(256); + lut.push_back(qMax); + for ( int x = qMin + 1; x <= qMax; 
++x ) + { + int index = std::max(0, x - int(zpIn)); + auto value = zpOut + MultiplyByQuantizedMultiplier(kRSqrtLut[index], qScale); + lut.push_back(uint8_t(std::min(qMax, std::max(qMin, int(value))))); + } + + auto lutTens = CreateConstTensor("rsqrt", ifmConn->tensor->Type(), std::make_shared(std::move(lut))); + returnOp = CreateLUT(ifmConn->tensor, lutTens, ifmConn->quantization, ifmConn->quantization, lutTens->Type(), + &ifmConn->shape, ofmConn->tensor, ifmConn->slice, ofmConn->slice); + returnOp->SetRounding(RoundMode::NATURAL); + } + else if ( opType == OpType::Rsqrt && ifmConn->tensor->Type() == DataType::Int16 && ofmConn->tensor->Type() == DataType::Int16 ) + { + const auto ofmScale = operation->Output(TensorUsage::OFM)->quantization.scales[0].Dequantize(); + returnOp = ConvertToInterpolatingLUT16( + operation, + [&ofmScale](double x) -> double + { + if ( x <= 0.0f ) + { + return IntegerMax(DataType::Int16) * ofmScale; + } + else + { + return 1 / std::sqrt(x); + } + }, + "Rsqrt16(interp)"); + } + + if ( operation != returnOp ) + { + RecordOptimisation(operation, returnOp); + operation->Disconnect(); + } + + return returnOp; +} + +// Based on explicit padding provided in a PAD operation, returns adjusted value for +// padAfter that provides equivalent results when used with explicit padding +int TFLiteGraphOptimiser::CalcPadAfter(int inputSize, int stride, int filterSize, int padBefore, int padAfter) +{ + int totalPadding = NeededTotalPadding(inputSize, stride, filterSize); + // The bottom/right padding might need downward adjustment depending on stride/input size + int remainderDiff = padAfter % stride - (totalPadding - padBefore) % stride; + return std::max(0, padAfter - remainderDiff - (remainderDiff >= 0 ? 0 : stride)); +} + +// Tries to completely remove a PAD operator by using explicit padding. +// E.g. 
a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3 +// is rewritten such that the PAD is removed, and the CONV uses explicit padding. +// Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV +// This is the most efficient way to implement PAD, but cannot be done for all pad sizes. +Operation *TFLiteGraphOptimiser::ReplacePadByExplicitPadding(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + if ( IsConvolution(operation->Type()) && operation->Kernel()->Padding().IsZero() && operation->IFM(0)->Writers().size() == 1 ) + { + // Potential for future optimization: in certain cases also Pad+AvgPool can be handled + // by changing to Depthwise. + auto padOp = operation->IFM(0)->Writers()[0].get(); + if ( padOp->Type() != OpType::Pad ) + { + return operation; + } + auto padIfmConn = padOp->Input(TensorUsage::IFM0); + auto padOfmConn = padOp->Output(TensorUsage::OFM); + const auto &padIfm = padOp->IFM(0); + const auto &padOfm = padOp->OFM(); + + if ( padIfm->Type() != padOfm->Type() || !IsScalingValidAndEqual(*padIfmConn, *padOfmConn) ) + { + return operation; + } + const auto padValues = padOp->Input(MakeTensorUsage(TensorUsage::Params, 0))->tensor->View().Values(); + int top = padValues[2]; + int bottom = padValues[3]; + int left = padValues[4]; + int right = padValues[5]; + const auto &k = operation->Kernel(); + const auto &kwh = k->DilatedWH(); + if ( left + right >= kwh.x || top + bottom >= kwh.y ) + { + // Too much padding + return operation; + } + const auto &ifmShape = padOp->Input(TensorUsage::IFM0)->shape; + int bottomPad = CalcPadAfter(ifmShape.Height(), k->Stride().y, kwh.y, top, bottom); + int rightPad = CalcPadAfter(ifmShape.Width(), k->Stride().x, kwh.x, left, right); + // Adjust the padding attributes of the convolution operator + auto kernel = std::make_unique( + Kernel(k->Size(), k->Stride(), k->Dilation(), k->DepthMultiplier(), Margin(top, left, bottomPad, rightPad))); + 
operation->SetKernel(std::move(kernel)); + operation->CopyInput(TensorUsage::IFM0, *(padOp->Input(TensorUsage::IFM0))); + if ( padOfm->Readers().empty() ) + { + // Bypass the PAD operator + padOp->Disconnect(); + } + } + return operation; +} + +void TFLiteGraphOptimiser::MakeMemoryCopyForPad( + const char *name, const Operation *operation, TensorConnection *ofmConn, const Shape &shape, const Shape &offset) +{ + auto dtype = ofmConn->tensor->Type(); + std::vector zeroBuf(DataTypeStorageSizeBytes(dtype, shape.Elements())); + std::fill(zeroBuf.begin(), zeroBuf.end(), uint8_t(ofmConn->quantization.zeroPoints[0])); + + auto zeroTens = CreateConstTensor(ofmConn->tensor->Name() + "/" + name, dtype, std::make_shared(std::move(zeroBuf)), &shape); + auto op = std::make_shared(OpType::MemoryCopy); + op->SetRounding(RoundMode::NATURAL); + + op->ConnectInput(TensorUsage::IFM0, zeroTens).Set(ofmConn->quantization); + op->ConnectOutput(TensorUsage::OFM, ofmConn->tensor).Set(ofmConn->shape).Set(ofmConn->quantization).Set({offset, shape}); + RecordOptimisation(operation, op.get()); +} + +// Rewrites PAD operator to a MemoryCopy that copies the IFM to the OFM +// + up to 4 MemoryCopy operators that fill the OFM with zeros at the borders. 
+// This is done as fall-back for the PAD operators that remain after ReplacePadByExplicitPadding +Operation *TFLiteGraphOptimiser::ConvertPad(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + if ( operation->Type() != OpType::Pad ) + { + return operation; + } + const auto &ifmConn = operation->Input(TensorUsage::IFM0); + const auto &ifmShape = ifmConn->shape; + const auto &ofmConn = operation->Output(TensorUsage::OFM); + const auto &ofmShape = ofmConn->shape; + const auto ¶msConn = operation->Input(TensorUsage::Params); + const auto padValues = paramsConn->tensor->View().Values(); + int top = padValues[2]; + int bottom = padValues[3]; + int left = padValues[4]; + int right = padValues[5]; + int near = padValues[6]; + int far = padValues[7]; + + // Create MemoryCopy op that copies IFM to the right place inside the OFM + Shape shp0 = ofmShape.WithZeros(); + auto mainOp = MakeMemoryCopyForConcat(ofmConn, ifmConn, shp0.WithHeight(top).WithWidth(left).WithDepth(near)); + RecordOptimisation(operation, mainOp.get()); + // Add operations that fill the borders of the OFM + if ( top > 0 ) + { + Shape shape = ofmShape.WithHeight(top); + MakeMemoryCopyForPad("top", operation, ofmConn, shape, shp0); + } + if ( bottom > 0 ) + { + Shape shape = ofmShape.WithHeight(bottom); + Shape offset = shp0.WithHeight(ofmShape.Height() - bottom); + MakeMemoryCopyForPad("bottom", operation, ofmConn, shape, offset); + } + if ( left > 0 ) + { + Shape shape = ifmShape.WithWidth(left).WithDepth(ofmShape.Depth()); + Shape offset = shp0.WithHeight(top); + MakeMemoryCopyForPad("left", operation, ofmConn, shape, offset); + } + if ( right > 0 ) + { + Shape shape = ifmShape.WithWidth(right).WithDepth(ofmShape.Depth()); + Shape offset = shp0.WithHeight(top).WithWidth(ofmShape.Width() - right); + MakeMemoryCopyForPad("right", operation, ofmConn, shape, offset); + } + if ( near > 0 ) + { + Shape shape = ifmShape.WithDepth(near); + Shape offset = shp0.WithHeight(top).WithWidth(left); + 
MakeMemoryCopyForPad("near", operation, ofmConn, shape, offset); + } + if ( far > 0 ) + { + Shape shape = ifmShape.WithDepth(far); + Shape offset = shp0.WithHeight(top).WithWidth(left).WithDepth(ofmShape.Depth() - far); + MakeMemoryCopyForPad("far", operation, ofmConn, shape, offset); + } + operation->Disconnect(); + return mainOp.get(); +} + +TFLiteGraphOptimiser::TFLiteGraphOptimiser(Architecture *arch, const GraphOptimiserOptions &options, OptimiserDatabase *db) : + GraphOptimiser(arch, options, db) +{ + _softmax = std::make_unique(arch, db); +} + +void TFLiteGraphOptimiser::OptimiseGraph(Graph *graph) +{ + for ( auto iOpt = GraphOptimisationSteps().begin(); iOpt != GraphOptimisationSteps().end(); ++iOpt ) + { + LOG_TRACE1("GraphOptimiser {0}/{1}\n", std::distance(GraphOptimisationSteps().begin(), iOpt) + 1, + GraphOptimisationSteps().size()); + // Check if function lists are empty. Do not call for step that only contain disabled debug functions. + if ( !iOpt->opFunction.empty() || !iOpt->tensorFunction.empty() ) + { + RewriteGraph(graph, *iOpt); + } + } +} + +} // namespace regor diff --git a/ethosu/regor/compiler/tflite_graph_optimiser.hpp b/ethosu/regor/compiler/tflite_graph_optimiser.hpp new file mode 100644 index 00000000..4842b0e0 --- /dev/null +++ b/ethosu/regor/compiler/tflite_graph_optimiser.hpp @@ -0,0 +1,312 @@ +// +// SPDX-FileCopyrightText: Copyright 2021-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#pragma once + +#include "common/logging.hpp" + +#include "architecture/architecture.hpp" +#include "common/scaling.hpp" +#include "graph.hpp" +#include "graph_optimiser.hpp" +#include "op_type.hpp" +#include "operation.hpp" +#include "softmax.hpp" +#include "tensor.hpp" + +#include +#include +#include +#include +#include +#include +#include + +namespace regor +{ +// Sigmoid clamp (-8, 8) +static double ClampSigmoid8(double value) +{ + return ClampSigmoid(value, 8.0); +}; + +/// +/// TFLite Graph optimiser +/// +class TFLiteGraphOptimiser : public GraphOptimiser +{ + using OpRewriteFunction = Operation *(TFLiteGraphOptimiser::*)(Graph *, Operation *); + using TensorRewriteFunction = Tensor *(TFLiteGraphOptimiser::*)(Graph *, Tensor *); + using GraphOptStepArray = std::vector>; + +private: + std::unique_ptr _softmax; + + // utility functions + + // Is the scaling of tensor a and b valid and equal. + bool IsScalingValidAndEqual(const TensorConnection &a, const TensorConnection &b); + // Multiplies int with QuantizedScale with rounding. + int MultiplyByQuantizedMultiplier(int x, QuantizedScale quantScale); + + Operation *MakeMulWithConstTensor(const std::string &name, const TensorConnection &ifmConn, + const TensorConnection &ofmConn, const std::shared_ptr &constTens, const Quantization &quantization); + + // Helper function for converting operations + Operation *MakeOperation(OpType opType, const TensorConnection *ifm0Conn, const TensorConnection *ifm1Conn, const TensorConnection *ofmConn); + + // Converts 16-bit Leaky ReLU + Operation *ConvertLeakyRelu16bit(TensorConnection &ifmConn, TensorConnection &ofmConn, Operation *operation); + + // Get axis parameter for operator + int GetAxis(const Operation *const operation); + // Calculate the read shape and offset values for Slice. 
+ void SetSliceOffsetValues(Operation *const operation, Shape &readShape, Shape &readOffset); + // Calculate the read shape and offset values for StridedSlice. + void SetStridedSliceOffsetValues(Operation *const operation, const TensorConnection *const ifmConn, Shape &readShape, Shape &readOffset); + // Creates MemoryCopy operation for the given ifm/ofm and write offset. + std::shared_ptr MakeMemoryCopyForConcat( + const TensorConnection *const ofmConn, const TensorConnection *const ifmConn, const Shape &writeOffset); + // Creates a MemoryCopy operation for the given ifm/ofm and readOffset. + std::shared_ptr MakeMemoryCopyForSplitOps(const TensorConnection *const ofmConn, + const TensorConnection *const ifmConn, const Shape &readShape, const Shape &readOffset); + + // Creates the desired shape of either: + // - Concat (Input shape - supply IFM base shape) + // - Split/SplitV (Output shape - supply OFM base shape) + // + // returns the Desired shape. + // Also calculates the axis4D, returned through supplied pointer. + Shape MakeConcatSplitDesiredShape(int axis, const Shape &baseShape, int *const axis4D); + + // Creates the desired shape of either: + // - pack (Input shape - supply IFM base shape) + // - unpack (Output shape - supply OFM base shape) + // + // returns the Desired shape. + // Unpack keeps the unpacked dimension set to 1. + // Also calculates the axis4D, returned through supplied pointer. + Shape MakePackUnpackDesiredShape(int axis, const Shape &baseShape, int *const axis4D); + + // Creates the desired Output shape of StridedSlice. + // + // returns the Desired shape. 
+ Shape MakeStridedSliceDesiredShape(Operation *const operation, const Shape &baseShape); + // Move Split/slice op to consumer + void MoveToConsumer(const Operation *const operation, Operation *const cons); + + static void ReplaceOperation(Operation *const operationToReplace, Operation *const newOperation); + + Operation *MakeDepthwiseMeanOp(const TensorConnection *ifmConn, const Shape &ifmShape4D, const Shape &readShape, + const Shape &readOffset, const Shape &ofmShape4D, int w, int h, const std::string &name, std::shared_ptr &weightTensor, + std::shared_ptr biasTensor, const Quantization &ifmQuant, const Quantization &weightQuant, const Quantization &ofmQuant); + + // Converts op to int8/uint8 LUT which is generated with the given function. + Operation *ConvertToLUT8(Operation *op, std::function func, const std::string &name); + + // Converts op to int16 interpolating LUT which is generated with the given function. + Operation *ConvertToInterpolatingLUT16(Operation *op, std::function func, const std::string &name); + + // Converts int16 Tanh/Sigmoid to LUT16 + Operation *ConvertTanhSigmoidToLUT16(Operation *const op); + + + // Rewrite functions + Operation *ConvertExpToLUT(Graph *const graph, Operation *const operation); + Operation *RewriteConcat(Graph *const graph, Operation *const operation); + Operation *RewriteSplit(Graph *const graph, Operation *const operation); + Operation *RemoveReshape(Graph *const graph, Operation *const operation); + Operation *ExtractTransposePermutation(Graph *const graph, Operation *const operation); + Operation *RemoveTranspose(Graph *const graph, Operation *const operation); + Operation *RemoveReverse(Graph *const graph, Operation *const operation); + Operation *ConvertGather(Graph *const graph, Operation *const operation); + Operation *ConvertScatter(Graph *const graph, Operation *const operation); + Operation *ConvertResize(Graph *const graph, Operation *const operation); + Operation *ConvertArgMax(Graph *const graph, Operation 
*const operation); + Operation *MoveSplitSliceToConsumer(Graph *const, Operation *const operation); + + // RewriteBatchMatMul must be called before rewrite of transpose + Operation *CreateTransposeForMatMul(const std::shared_ptr &ifm, const Shape &ofmShape); + Operation *RewriteBatchMatMul(Graph *const, Operation *const operation); + Operation *RewriteSpaceToBatchConvBatchToSpace(Graph *const, Operation *const operation); + Operation *FixupDilationGT2(Graph *const, Operation *const operation); + Operation *FixupBias(Graph *const, Operation *const operation); + + // Rewrite FullyConnect with dynamic weights to MatMul + Operation *RewriteFullyConnectDynamic(Graph *const, Operation *const operation); + + Operation *CreateCastToInt32(const TensorConnection *ifmConn); + Operation *RewriteSquaredDifference(Graph *const, Operation *const operation); + + // Convert depthwise convolutions with a depth multiplier greater than 1 into a single Conv2D if: + // - the input depth is 1; and + // - the output depth equals the depth multiplier. + Operation *RewriteDepthwise(Graph *const, Operation *const operation); + + // Check that no reshape like operations remain in graph. 
+ Operation *CheckReshapeOpsRemoved(Graph *const graph, Operation *const operation); + + Operation *ConvertSoftmaxOps(Graph *const graph, Operation *const operation); + Operation *RewriteFullyConnectedInput(Graph *const graph, Operation *const operation); + + // Unroll convolution if stride is too great + Operation *UnrollConv(Graph *const, Operation *const operation); + + // Must be called after RewriteFullyConnectedInput + Operation *ConvertBatchedFullyConnected(Graph *const graph, Operation *const operation); + Operation *ConvertMeanOps(Graph *const, Operation *const operation); + + // Converts int8/uint8 Sigmoid and Tanh to a LUT based solution + Operation *ConvertTanhSigmoidToLUT(Graph *const, Operation *const operation); + + // Convert PReLU to (ReLU + Minimum + Mul + Add) + Operation *ConvertPrelu(Graph *const graph, Operation *const operation); + + // Converts Leaky ReLU when needed (LUT based solution or mul + max). + Operation *ConvertLeakyRelu(Graph *const graph, Operation *const operation); + + // Converts HardSwish to a LUT based solution. + Operation *ConvertHardSwishToLUT(Graph *const graph, Operation *const operation); + + // Converts RSqrt to a LUT based solution. + Operation *ConvertRSqrtToLUT(Graph *const graph, Operation *const operation); + + // Based on explicit padding provided in a PAD operation, returns adjusted value for + // padAfter that provides equivalent results when used with explicit padding + int CalcPadAfter(int inputSize, int stride, int filterSize, int padBefore, int padAfter); + + // Tries to completely remove a PAD operator by using explicit padding. + // E.g. a PAD operation that pads 1, followed by a CONV with VALID padding and kernel size 3 + // is rewritten such that the PAD is removed, and the CONV uses explicit padding. + // Converts tens1 -> PAD -> tens2 -> CONV to tens1 -> CONV + // This is the most efficient way to implement PAD, but cannot be done for all pad sizes. 
+ Operation *ReplacePadByExplicitPadding(Graph *const graph, Operation *const operation); + + void MakeMemoryCopyForPad(const char *name, const Operation *operation, TensorConnection *ofmConn, + const Shape &shape, const Shape &offset); + + // Rewrites PAD operator to a MemoryCopy that copies the IFM to the OFM + // + up to 4 MemoryCopy operators that fill the OFM with zeros at the borders. + // This is done as fall-back for the PAD operators that remain after ReplacePadByExplicitPadding + Operation *ConvertPad(Graph *const graph, Operation *const operation); + +public: + // The graph optimisation steps. + // Order matters, array of rewrites processed in order. + // clang-format off + const GraphOptStepArray _graphOptimisationSteps = + {{ + { + { +#if LOG_TRACE1_ON + &GraphOptimiser::VisitTensorLog +#endif + }, + { +#if LOG_TRACE1_ON + &GraphOptimiser::VisitOperatorLog, +#endif + } + }, + { + {}, + { + &TFLiteGraphOptimiser::RewriteConcat + } + }, + { + {}, + { + &TFLiteGraphOptimiser::RewriteSplit + } + }, + { + {}, + { + &TFLiteGraphOptimiser::RewriteBatchMatMul, + &TFLiteGraphOptimiser::RewriteFullyConnectDynamic + } + }, + { + {}, + { + &TFLiteGraphOptimiser::RemoveReshape, + &TFLiteGraphOptimiser::RemoveTranspose, + &TFLiteGraphOptimiser::RemoveReverse, + } + }, + { + {}, + { + &TFLiteGraphOptimiser::RewriteSpaceToBatchConvBatchToSpace, + &TFLiteGraphOptimiser::FixupDilationGT2, + &TFLiteGraphOptimiser::FixupBias, + &TFLiteGraphOptimiser::RewriteDepthwise, + &TFLiteGraphOptimiser::RewriteFullyConnectedInput, + &TFLiteGraphOptimiser::ConvertArgMax, + &TFLiteGraphOptimiser::ConvertBatchedFullyConnected, + &TFLiteGraphOptimiser::ConvertExpToLUT, + &TFLiteGraphOptimiser::ConvertTanhSigmoidToLUT, + &TFLiteGraphOptimiser::ConvertSoftmaxOps, + &TFLiteGraphOptimiser::ReplacePadByExplicitPadding, + &TFLiteGraphOptimiser::ConvertMeanOps, + &TFLiteGraphOptimiser::ConvertPrelu, + &TFLiteGraphOptimiser::ConvertLeakyRelu, + &TFLiteGraphOptimiser::ConvertHardSwishToLUT, + 
&TFLiteGraphOptimiser::ConvertRSqrtToLUT, + &TFLiteGraphOptimiser::ConvertGather, + &TFLiteGraphOptimiser::RewriteSquaredDifference, + &TFLiteGraphOptimiser::ConvertScatter, + &TFLiteGraphOptimiser::ConvertResize, + &TFLiteGraphOptimiser::UnrollConv, + } + }, + // MoveSplitSliceToConsumer need to be done after any other optimisation that can affect the ifm/ofm shapes + // has been performed, since the ifm/ofm shapes are of importance to this function. + { + {}, + { + &TFLiteGraphOptimiser::ConvertPad, + &TFLiteGraphOptimiser::MoveSplitSliceToConsumer + } + }, + { + { +#if LOG_TRACE1_ON + &GraphOptimiser::VisitTensorLog +#endif + }, + { + &TFLiteGraphOptimiser::CheckReshapeOpsRemoved, +#if LOG_TRACE1_ON + &GraphOptimiser::VisitOperatorLog, +#endif + &GraphOptimiser::RecordOptimisation + } + } + }}; + // clang-format on + + explicit TFLiteGraphOptimiser(Architecture *arch, const GraphOptimiserOptions &options, OptimiserDatabase *db); + + const GraphOptStepArray &GraphOptimisationSteps() const { return _graphOptimisationSteps; } + + void OptimiseGraph(Graph *graph); +}; + +} // namespace regor diff --git a/ethosu/regor/compiler/tflite_graph_optimiser_tp.cpp b/ethosu/regor/compiler/tflite_graph_optimiser_tp.cpp new file mode 100644 index 00000000..3d74371b --- /dev/null +++ b/ethosu/regor/compiler/tflite_graph_optimiser_tp.cpp @@ -0,0 +1,145 @@ +// +// SPDX-FileCopyrightText: Copyright 2021, 2024 Arm Limited and/or its affiliates +// SPDX-FileCopyrightText: Copyright 2020 The TensorFlow Authors. All Rights Reserved. +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "compiler/tflite_graph_optimiser.hpp" + +namespace +{ + +// Implementation from TensorFlow Lite Micro kernel +int16_t SaturatingLeftShift(std::int16_t value, int amount) +{ + int32_t result = value << amount; + result = std::min(result, std::numeric_limits::max()); + result = std::max(result, std::numeric_limits::min()); + return int16_t(result); +} + +// Implementation from TensorFlow Lite Micro kernel +// Similar to ARM instruction SQDMULH. +// Similar to gemmlowp::SaturatingRoundingDoublingHighMul except +// rounding to zero instead of to nearest (SQRDMULH). +int16_t SaturatingDoublingHighMul(int16_t a, int16_t b) +{ + bool overflow = a == b && a == std::numeric_limits::min(); + int32_t a_32(a); + int32_t b_32(b); + int32_t ab_32 = a_32 * b_32; + int16_t ab_x2_high16 = int16_t((ab_32) / (1 << 15)); + return overflow ? std::numeric_limits::max() : ab_x2_high16; +} + +} // namespace + +namespace regor +{ + +// Converts HardSwish to a LUT based solution. 
+Operation *TFLiteGraphOptimiser::ConvertHardSwishToLUT(Graph *const graph, Operation *const operation) +{ + UNUSED(graph); + auto returnOp = operation; + auto opType = operation->Type(); + auto ifmConn = operation->Input(TensorUsage::IFM0); + auto ofmConn = operation->Output(TensorUsage::OFM); + + if ( opType == OpType::HardSwish && ifmConn != nullptr && ofmConn != nullptr ) + { + auto ifm = ifmConn->tensor.get(); + auto ofm = ofmConn->tensor.get(); + + // Generate the LUT + const double ifmScale = ifmConn->quantization.scales[0].Dequantize(); + const double ofmScale = ofmConn->quantization.scales[0].Dequantize(); + const int zpIn = int(ifmConn->quantization.zeroPoints[0]); + const int zpOut = int(ofmConn->quantization.zeroPoints[0]); + const int qMin = ifm->Type() == DataType::Int8 ? -128 : 0; + const int qMax = ifm->Type() == DataType::Int8 ? 127 : 255; + + const double ifmScaleHires = (1.0 / 128.0) * ifmScale; + const double reluMultiplier = 3.0 / 32768.0; + + QuantizedScale outScale(ifmScaleHires / ofmScale); + QuantizedScale reluScale(ifmScaleHires / reluMultiplier); + int16_t outScale16 = DownScaleInt32ToInt16Multiplier(outScale.scale); + int16_t reluScale16 = DownScaleInt32ToInt16Multiplier(reluScale.scale); + // convert to left shift-positive notation + int outShift = 31 - outScale.shift; + int reluShift = 31 - reluScale.shift; + + std::vector lut; + lut.reserve(256); + for ( int x = qMin; x <= qMax; ++x ) + { + // Compute the "relu-ish multiplier". 
+ // This matches the code in TensorFlow Lite Micro kernel + const int16_t inputValue = int16_t(x - zpIn); + + const int16_t inputValueOnHiresInputScale = int16_t(inputValue << 7); + + const int16_t inputValueOnPreshiftOutputScale = gemmlowp::SaturatingRoundingDoublingHighMul(inputValueOnHiresInputScale, outScale16); + + int16_t reluValue = inputValueOnHiresInputScale; + + if ( reluShift > 0 ) + { + reluValue = SaturatingLeftShift(reluValue, reluShift - 1); + } + + reluValue = gemmlowp::SaturatingRoundingDoublingHighMul(reluValue, reluScale16); + + if ( reluShift > 0 ) + { + reluValue = SaturatingLeftShift(reluValue, 1); + } + + if ( reluShift < 0 ) + { + reluValue = gemmlowp::RoundingDivideByPOT(reluValue, -reluShift); + } + reluValue = int16_t((reluValue + (1 << 15)) >> 1); + + const int16_t preshiftOutputValue = SaturatingDoublingHighMul(reluValue, inputValueOnPreshiftOutputScale); + + int16_t outputValue = gemmlowp::RoundingDivideByPOT(preshiftOutputValue, -outShift); + + int lutVal = outputValue + zpOut; + lutVal = std::min(qMax, std::max(qMin, lutVal)); + lut.push_back(uint8_t(lutVal)); + } + + auto lutTens = CreateConstTensor("hardswish", ifmConn->tensor->Type(), std::make_shared(std::move(lut))); + // The LUT must be applied without any preceding rescaling (the LUT itself performs the rescale), + // so even if the OFM has a different scale than the IFM, the generated OFM scale instructions + // should be the same as the IFM + returnOp = CreateLUT(ifmConn->tensor, lutTens, ifmConn->quantization, ifmConn->quantization, lutTens->Type(), + &ifmConn->shape, ofmConn->tensor, ifmConn->slice, ofmConn->slice); + returnOp->SetRounding(RoundMode::NATURAL); + } + + if ( operation != returnOp ) + { + RecordOptimisation(operation, returnOp); + operation->Disconnect(); + } + + return returnOp; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/tosa_graph_validator.cpp b/ethosu/regor/compiler/tosa_graph_validator.cpp new file mode 100644 index 00000000..4213beda 
--- /dev/null +++ b/ethosu/regor/compiler/tosa_graph_validator.cpp @@ -0,0 +1,92 @@ +// +// SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "tosa_graph_validator.hpp" + +#include "compiler.hpp" + +#include + +namespace +{ + +std::optional MaybeGetTosaVersion(uint32_t syntaxVersion) +{ + if ( syntaxVersion == 0 ) syntaxVersion = (GraphApi::VERSION_TOSA_0_60 | GraphApi::PROFILE_BASELINE); + if ( (syntaxVersion & GraphApi::VERSION_TOSA_0_60) == GraphApi::VERSION_TOSA_0_60 ) + { + return GraphApi::VERSION_TOSA_0_60; + } + else if ( (syntaxVersion & GraphApi::VERSION_TOSA_0_80) == GraphApi::VERSION_TOSA_0_80 ) + { + return GraphApi::VERSION_TOSA_0_80; + } + else if ( (syntaxVersion & GraphApi::VERSION_TOSA_1_00) == GraphApi::VERSION_TOSA_1_00 ) + { + return GraphApi::VERSION_TOSA_1_00; + } + return std::nullopt; +} + +} // namespace + +namespace regor +{ + +bool TosaGraphValidator::HandlesSyntax(uint32_t syntaxVersion) +{ + return MaybeGetTosaVersion(syntaxVersion).has_value(); +} + +TosaGraphValidator::TosaGraphValidator(GraphNotation notation, uint32_t syntaxVersion, Compiler *compiler) : + GraphValidator(notation, syntaxVersion) +{ + _context.version = MaybeGetTosaVersion(syntaxVersion).value_or(GraphApi::VERSION_TOSA_0_60); + + if ( (syntaxVersion & GraphApi::PROFILE_MAIN) == GraphApi::PROFILE_MAIN ) + { + _context.profile = 
GraphApi::PROFILE_MAIN; + } + else + { + _context.profile = GraphApi::PROFILE_BASELINE; + } + _context.GetGraph = [compiler](const char *name) { return compiler->GetGraph(name); }; +} + +bool TosaGraphValidator::Validate(Graph *graph) +{ + bool graphValid = true; + Graph::TraverseGraphFromEnd(graph->Outputs(), + [&graphValid, &graph, this](Operation *op) -> bool + { + try + { + tosa::validator::ValidateOperator(op, _context); + } + catch ( const std::invalid_argument &e ) + { + graphValid = false; + _validationErrors.emplace_back(Error{op->Type(), e.what()}); + } + return true; + }); + return graphValid; +} + +} // namespace regor diff --git a/ethosu/regor/compiler/tosa_graph_validator.hpp b/ethosu/regor/compiler/tosa_graph_validator.hpp new file mode 100644 index 00000000..ae5d0686 --- /dev/null +++ b/ethosu/regor/compiler/tosa_graph_validator.hpp @@ -0,0 +1,39 @@ +// +// SPDX-FileCopyrightText: Copyright 2023 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#pragma once + +#include "graph_validator.hpp" +#include "tosa/tosa_validator.hpp" + +#include + +namespace regor +{ + +class TosaGraphValidator : public GraphValidator +{ + tosa::validator::Context _context; + +public: + TosaGraphValidator(GraphNotation notation, uint32_t syntaxVersion, Compiler *compiler); + static bool HandlesSyntax(uint32_t syntaxVersion); + bool Validate(Graph *graph) override; +}; + +} // namespace regor diff --git a/ethosu/regor/dependencies/mlw_codec/CMakeLists.txt b/ethosu/regor/dependencies/mlw_codec/CMakeLists.txt new file mode 100644 index 00000000..14b40296 --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/CMakeLists.txt @@ -0,0 +1,183 @@ +# +# SPDX-FileCopyrightText: Copyright 2022-2024 Arm Limited and/or its affiliates +# +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +cmake_minimum_required(VERSION 3.15.6) +cmake_policy(SET CMP0063 NEW) +project(mlw_codec VERSION 1.0 DESCRIPTION "MLW Codec" LANGUAGES C CXX) +include(GNUInstallDirs) + +if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + set(CMAKE_BUILD_TYPE Debug) +endif() + +option(DEBUG_PACKET "Debug packet syntax" OFF) +option(DEBUG_BITSTREAM "Debug bitstream contents" OFF) + +if (DEBUG_PACKET) + list(APPEND MLW_DEFINES "ENABLE_DEBUG_PACKET=1") +endif() + +if (DEBUG_BITSTREAM) + list(APPEND MLW_DEFINES "ENABLE_DEBUG_BITSTREAM=1") +endif() + +message(STATUS "Build type: ${CMAKE_BUILD_TYPE}") + +# Add a target library made from the above source files +add_library(mlw_codec_st STATIC + "source/mlw_encode.cpp" + "source/mlw_encode_fwd.cpp" + "source/ml_ethosu_encode.cpp" + "source/mlw_decode.cpp" +) +set_target_properties(mlw_codec_st PROPERTIES OUTPUT_NAME mlw_codec) + +# Default compiler settings +set_property(TARGET mlw_codec_st PROPERTY CXX_STANDARD 11) +set_property(TARGET mlw_codec_st PROPERTY CXX_EXTENSIONS OFF) +set_property(TARGET mlw_codec_st PROPERTY POSITION_INDEPENDENT_CODE ON) + +add_custom_target(mlw_codec DEPENDS mlw_codec_st) + +if (MSVC) + set(MLW_TOOLCHAIN_CXX_OPTIONS + "/DWIN32_LEAN_AND_MEAN" + "/DNOMINMAX" + "/D_USE_MATH_DEFINES" + "/D_CRT_SECURE_NO_WARNINGS" + "/D_CRT_SECURE_CPP_OVERLOAD_STANDARD_NAMES" + "/bigobj" + "/experimental:external" + "/external:W0" + "/external:anglebrackets" + # Diagnostics + "/W3" # Default warning level (severe + significant + production quality). 
+ "/wd4200" # "nonstandard extension used : zero-sized array in struct/union" + "/wd4018" # "signed/unsigned mismatch in comparison" + "/wd4146" # operator applied to unsigned type, result still unsigned + "/wd4244" # possible loss of data + "/wd4267" # initializing: possible loss of data + "/wd4005" # allow: macro redefinition + "/wd4065" # allow: switch statement contains 'default' but no 'case' labels + "/wd4141" # allow: inline used more than once + "/wd4624" # allow: destructor was implicitly defined as deleted + "/wd4146" # operator applied to unsigned type, result still unsigned + "/wd4244" # possible loss of data + "/wd4267" # initializing: possible loss of data + "/wd5105" # allow: macro expansion producing 'defined' has undefined behavior + ) +else() + set(MLW_TOOLCHAIN_CXX_OPTIONS + # Enabled compiler options + -Wall -Wextra -Wsign-compare -Wold-style-cast -Wswitch-default + -Wformat -Wdouble-promotion -Wredundant-decls -Wlogical-op + -Wnon-virtual-dtor -Wcast-align -Wshadow + # Disabled compiler options + -Wno-format-contains-nul -Wno-format-extra-args + -Wno-unused-function -Wno-unused-label -Wno-maybe-uninitialized -Wno-unused + -Wno-unused-local-typedefs -Wno-stringop-truncation -Wno-missing-field-initializers + # Config specific options + $<$:-Werror> + ) +endif() + +# Config specific compiler defines +set(MLW_TOOLCHAIN_CXX_DEFINES + RELEASE=$> + DEBUG=$>) + +# Configure target's compilation options +target_compile_definitions(mlw_codec_st PRIVATE ${MLW_DEFINES} $<$:${MLW_TOOLCHAIN_CXX_DEFINES}>) +target_compile_options(mlw_codec_st PRIVATE $<$:${MLW_TOOLCHAIN_CXX_OPTIONS}>) + +# Properties +file(GLOB MLW_CODEC_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/include/*) + +set_target_properties(mlw_codec_st PROPERTIES + VERSION ${${PROJECT_NAME}_VERSION} + #PUBLIC_HEADER "${MLW_CODEC_HDRS}" +) + +# Include directories (private and public interface) +target_include_directories(mlw_codec_st + PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" + INTERFACE + "$" + 
"$/${CMAKE_INSTALL_INCLUDEDIR}>" +) + +# Install +install(TARGETS mlw_codec_st + EXPORT ${PROJECT_NAME} + LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}" + ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}" + RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}" + PUBLIC_HEADER DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/${PROJECT_NAME}" +) +if(MSVC AND NOT CMAKE_VERSION VERSION_LESS 3.15) + install(FILES + "$/$$.pdb" + DESTINATION ${CMAKE_INSTALL_LIBDIR} + OPTIONAL + ) +endif() +install(EXPORT ${PROJECT_NAME} + NAMESPACE ${PROJECT_NAME}:: + DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/${PROJECT_NAME}" +) + +# Aliases +add_library(${PROJECT_NAME}::mlw_codec_st ALIAS mlw_codec_st) + +# Use a PEP-656 compliant package tag +# The default value for this variable is not useful +if (NOT Python3_FOUND) + set(Python3_FIND_STRATEGY VERSION) + set(Python3_FIND_REGISTRY LAST) + set(Python3_FIND_FRAMEWORK LAST) + find_package(Python3 COMPONENTS Interpreter REQUIRED) +endif() +execute_process( + COMMAND ${Python3_EXECUTABLE} -c "import sysconfig; print(sysconfig.get_platform())" + OUTPUT_VARIABLE MLW_CODEC_SYSTEM_NAME OUTPUT_STRIP_TRAILING_WHITESPACE) + +set(CPACK_PACKAGE_NAME ${PROJECT_NAME}) +# Default variables +set(CPACK_PACKAGE_VENDOR "Arm") +set(CPACK_PACKAGE_DESCRIPTION "${PROJECT_DESCRIPTION}") +set(CPACK_PACKAGE_VERSION_MAJOR "${${PROJECT_NAME}_VERSION_MAJOR}") +set(CPACK_PACKAGE_VERSION_MINOR "${${PROJECT_NAME}_VERSION_MINOR}") +if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + set(CPACK_STRIP_FILES FALSE) + set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-dev-${MLW_CODEC_SYSTEM_NAME}) +else() + set(CPACK_STRIP_FILES TRUE) + set(CPACK_PACKAGE_FILE_NAME ${CPACK_PACKAGE_NAME}-${MLW_CODEC_SYSTEM_NAME}) +endif() +set(CPACK_VERBATIM_VARIABLES TRUE) + +# Archive generator setup +set(CPACK_BINARY_TGZ ON) +set(CPACK_BINARY_STGZ OFF) +set(CPACK_BINARY_TBZ2 OFF) +set(CPACK_BINARY_TXZ OFF) +set(CPACK_BINARY_TZ OFF) +set(CPACK_INSTALL_CMAKE_PROJECTS + 
"${CMAKE_CURRENT_BINARY_DIR};${CMAKE_PROJECT_NAME};/") + +include(CPack) diff --git a/ethosu/regor/dependencies/mlw_codec/include/mlw_decode.h b/ethosu/regor/dependencies/mlw_codec/include/mlw_decode.h new file mode 100644 index 00000000..8aca4ea2 --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/include/mlw_decode.h @@ -0,0 +1,50 @@ +// +// SPDX-FileCopyrightText: Copyright 2022-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#ifndef __MLW_DECODE_H__ +#define __MLW_DECODE_H__ + +#pragma once + +#include + +// Result of the decode process +typedef struct ml_decode_result_t +{ + int16_t *decoded_data; // decoded weight elements + int32_t decoded_length; // decoded weight length (in elements) + int32_t section_count; // number of sections in stream + int32_t *section_sizes; // section sizes in stream +} ml_decode_result_t; + + +#if defined __cplusplus +extern "C" +{ +#endif + + void ml_decode_ethosu_stream(ml_decode_result_t *result, const uint8_t *buffer, int size_bytes); + + void mld_free(ml_decode_result_t *result); + +#if defined __cplusplus +} // extern "C" +#endif + + +#endif diff --git a/ethosu/regor/dependencies/mlw_codec/include/mlw_encode.h b/ethosu/regor/dependencies/mlw_codec/include/mlw_encode.h new file mode 100644 index 00000000..3b728f33 --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/include/mlw_encode.h @@ -0,0 +1,114 @@ +// +// SPDX-FileCopyrightText: Copyright 2022-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#if !defined MLW_ENCODE_H +#define MLW_ENCODE_H + +#include +#include + +#if defined _MSC_VER + #define MLW_CODEC_PACKED + #define MLW_CODEC_USE_PACK_PRAGMA (1) +#else // __GNUC__ and clang + #define MLW_CODEC_PACKED __attribute__((packed)) +#endif + +// Encoder input parameters EthosU +typedef struct ml_ethosu_encode_params_t +{ + int32_t source_buffering_hint; // Recommend a buffering size + uint16_t encoder_flags; // Control flags to pass to the encoder + void* (*realloc_func)(void*, size_t, int purpose); // Custom output allocator function +} ml_ethosu_encode_params_t; + +// Resulting encoded section information +typedef struct ml_encode_section_t +{ + int32_t offset; // Byte offset of encoded section + int32_t size; // Byte size of encoded section + int32_t zeroes; // Number of zeroes encoded in a section + int8_t group_start; // Start of group +} ml_encode_section_t; + +// Result of the encode process +typedef struct ml_encode_result_t +{ + uint8_t *encoded_data; // Encoded weight data + int32_t encoded_length; // Encoded weight length (in bytes) + int32_t source_length; // Source elements read + ml_encode_section_t *section_info; // Array of sections in stream + int32_t section_count; // Number of section in stream +} ml_encode_result_t; + +#define MLW_SOURCE_QUERY_WEIGHTS 0 +#define MLW_SOURCE_QUERY_SHIFTS 1 + +// State of the source iterator +typedef struct ml_source_state_t +{ + uint8_t new_dim_mask; // Dimension start mask + uint8_t end_dim_mask; // Dimension end mask + bool eos; // End-of-stream flag +} ml_source_state_t; + +// Stream input callback (encoder will collect input through this function, of the size recommended by the buffering hint) +typedef int32_t (*ml_weight_source_fn)(int32_t query, ml_source_state_t *state, int16_t *buffer, int32_t size, void *user_arg); + +// Internal state context +typedef struct mle_context_t mle_context_t; + +#define MLW_ENCODE_FLAG_NONE (0) // Default encoding flag +#define MLW_ENCODE_NO_BITSTREAM (1) // Do 
not write any bitstream data (only return the length) +#define MLW_ENCODE_INSERT_PALETTE (2) // Insert a new palette header with this encode +#define MLW_ENCODE_RESET_PALETTE (4) // Clear and recalculate the palette header +#define MLW_ENCODE_PARTIAL_DATA (8) // Frequency analysis and palette will be constructed from incomplete data +#define MLW_ENCODE_NO_PADDING (16) // Disable trailing padding +#define MLW_ENCODE_NO_PALETTE_LUT (32) // Disable palette LUT generation +#define MLW_ENCODE_NO_ZERO_RUNS (64) // Disable zero run generation +#define MLW_ENCODE_DPIC_FORCE_PARAMS (128) // Force debug parameters +#define MLW_ENCODE_NEW_PALETTE (MLW_ENCODE_INSERT_PALETTE|MLW_ENCODE_RESET_PALETTE) + +#define MLW_ENCODE_SYNTAX_ETHOSU (0) // EthosU bitstream encode syntax +#define MLW_ENCODE_SYNTAX_ETHOSU_FWD (2) // EthosU FWD bitstream encode syntax + +#define MLW_ENCODE_ALLOC_GENERAL (0) // General allocations used by the encoder +#define MLW_ENCODE_ALLOC_METADATA (1) // Allocation for codec's metadata output +#define MLW_ENCODE_ALLOC_STREAM0 (2) // Stream 0 allocation for this codec +#define MLW_ENCODE_ALLOC_STREAM1 (3) // Stream 1 allocation for this codec + +#if defined __cplusplus +extern "C" +{ +#endif + // Baseline encode + mle_context_t *mle_create_context(int syntax); + int mle_context_query_zeroes(mle_context_t *ctx); + void mle_context_set_allocator(mle_context_t *ctx, void* (*realloc_func)(void*, size_t, int purpose)); + void mle_destroy_context(mle_context_t *ctx); + int mle_encode(mle_context_t *ctx, ml_encode_result_t *result, const int16_t *inbuf, int inbuf_size, unsigned mlw_encode_flags); + void mle_free(ml_encode_result_t *result); + + int32_t ml_encode_ethosu_stream(ml_encode_result_t *result, const ml_ethosu_encode_params_t *ep, ml_weight_source_fn src, void *user_arg, mle_context_t **ctx_out); + +#if defined __cplusplus +} // extern "C" +#endif + + +#endif // MLW_ENCODE_H diff --git a/ethosu/regor/dependencies/mlw_codec/source/ml_bit_buffer.hpp 
b/ethosu/regor/dependencies/mlw_codec/source/ml_bit_buffer.hpp new file mode 100644 index 00000000..61369d75 --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/source/ml_bit_buffer.hpp @@ -0,0 +1,255 @@ +// +// SPDX-FileCopyrightText: Copyright 2022-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#if !defined ML_BIT_BUFFER_HPP +#define ML_BIT_BUFFER_HPP + +#pragma once + +#include "ml_raw_buffer.hpp" + +#include + +struct bitbuf_t +{ +private: + uint32_t *_buf; + uint32_t _next = 0; + int _pos; // bit pos of next bit + int _limit; // in bytes + int _substream_start = 0; // start position for substreams + int _substream_end = 0; // end position for substreams + bool _enabled = false; + raw_buffer_t *_buffer; +public: + // Read constructor + bitbuf_t(const void *buf, int used_bytes) : _buffer(nullptr) + { + _limit = used_bytes & ~3; + _buf = reinterpret_cast(const_cast(buf)); + _pos = 0; + } + + // Write constructor + bitbuf_t(raw_buffer_t &buffer, int reserve_bytes, bool disable_writes) : _buffer(&buffer) + { + _enabled = !disable_writes; + if ( _enabled ) _buffer->reserve(reserve_bytes); + _limit = _buffer->capacity() & ~3; + prime( _buffer->used() * 8 ); // Start at the end of the buffer's used data + } + + // Sub-stream writer + bitbuf_t(bitbuf_t &dest, int bitpos, int bitlen=0) : _buffer(dest._buffer) + { + _limit = _buffer->capacity() & ~3; + _substream_start = bitpos; + 
bitlen = (bitlen <= 0) ? (_limit * 8) - bitpos : bitlen; // Default to rest of buffer + _substream_end = bitpos + bitlen; + int required = (_substream_end + 7 ) / 8; + assert( required <= _limit ); + _enabled = dest._enabled; + prime( bitpos ); + } + +public: + void put(int len, int32_t data) + { + assert( _buf == reinterpret_cast(_buffer->begin()) && "Buffer resized externally" ); + assert( (data & ((1 << len)-1)) == data && "Data must be pre-masked" ); + assert( ((_substream_end == 0) || (_pos + len <= _substream_end)) && "Write past end of substream section" ); + if ( len > 0 && _enabled ) + { + uint32_t next = _next; + int bitpos = _pos & 0x1F; + next |= uint32_t(data) << bitpos; + + if ( len >= (32 - bitpos) ) + { + // Write won't fit, reserve more output space + if ( (_pos / 8) >= _limit ) + { + extend(); + } + + _buf[_pos >> 5] = next; // Requires little-endian + next = uint32_t(data) >> (32 - bitpos); + } + + _next = next; + } + + _pos += len; + } + + void put_masked(int len, int32_t data) + { + put(len, data & ((1 << len)-1)); + } + + void fill(int len, unsigned bit) + { + const uint32_t mask = 0xFFFFFFFF * bit; + int remain = len; + while ( remain >= 32 ) + { + put(32, mask); + remain -= 32; + } + if (remain > 0) + put(remain, mask & ((1u << remain) - 1) ); + } + + void align(int bits, int fill_byte) + { + // Alignments must be power of 2 + assert( (bits & (bits - 1)) == 0 && bits ); + const int mask = bits-1; + + int distance = (bits - (_pos & mask)) & mask; + + // Byte align first + put_masked( distance & 7, fill_byte ); + distance &= ~7; + while (distance != 0) + { + put(8, fill_byte); + distance -= 8; + } + } + + void reposition(int bitpos) + { + int end = (_substream_end != 0) ? 
_substream_end : _limit * 8; + assert( (bitpos >= 0 && bitpos <= end) && "Can't reposition out of stream" ); + assert( (_substream_end == 0 || bitpos >= _substream_start) && "Can't reposition before substream"); + if ((_pos != bitpos) && (bitpos > 0) && (bitpos <= end) ) + { + // Reposition in bitstream. Caller must flush if writing. + prime(bitpos); + } + } + + void flush(bool done=true) + { + if ( !_enabled ) return; + // If buffering word is not empty, write it out as-is. + if ( _pos & 0x1F ) + { + // If writing a substream, blend any overlapping words with the parent stream + if ( _substream_end ) + { + int remain = _substream_end - (_pos & ~0x1F); // Remaining word-bits in this substream + if ( remain < 32 ) // Will overlap happen? + { + uint32_t mask = ~0u << (32 - remain); // Mask the parent bits that we want to keep + _next = (_buf[_pos >> 5] & mask) | (_next & ~mask); + } + } + // Otherwise limited by the buffer + else + { + // Only extend by space required to flush remaining word. 
+ if ( (_pos / 8) >= _limit ) + { + extend(true); + } + } + _buf[_pos >> 5] = _next; + } + if ( done && !_substream_end ) + { + _buffer->set_used( _pos / 8 ); + } + } + + void sync(bitbuf_t &substream) + { + flush(false); + substream.flush(false); + prime( std::max(_pos, substream._pos) ); + substream._buffer = nullptr; + } + + int get(int len) + { + if ( len == 0 ) + { + return 0; + } + + const unsigned mask = (1u << len) - 1; + assert( (_pos / 8) < _limit ); + uint32_t next = _buf[_pos >> 5]; + int bitpos = _pos & 0x1F; + // Bits from this word + unsigned value = next >> bitpos; + _pos += len; + + // Some of the bits are in the next word + if ( len > (32 - bitpos) ) + { + assert( (_pos / 8) < _limit ); + next = _buf[_pos >> 5]; + value |= next << (32 - bitpos); + } + + return int(value & mask); + } + + void read_align(int bits) + { + // Alignments must be power of 2 + assert( (bits & (bits - 1)) == 0 && bits ); + const int mask = bits-1; + _pos += (bits - (_pos & mask)) & mask; + } + + bool read_eos() const { return _pos/8 >= _limit; } + + int read_avail() const { return (_limit - (_pos / 8)) * 8 - (_pos & 7); } + int read_avail(int watermark) const { return (watermark - (_pos / 8)) * 8 - (_pos & 7); } + + int pos() const { return _pos; } + int byte_pos() const { return _pos / 8; } + int byte_length() const { return _limit; } + +private: + void prime(int bitpos) + { + assert( (bitpos >= 0) && (bitpos / 8) < _limit ); + // Prime (start up) the bitstream writer at the given bit position + _pos = bitpos; + _buf = reinterpret_cast(_buffer->begin()); + _next = _buf[bitpos >> 5]; + _next &= (1u << (bitpos & 0x1F)) - 1; + } + + void extend(bool exact_resize=false) + { + assert(_enabled); + _buffer->set_used( (_pos / 8) & ~3 ); // Only use whole words + _buffer->reserve( sizeof(uint32_t), exact_resize ); // Buffer implementation must optimise small requests + assert( (_buffer->capacity() & ~3) > _limit ); + assert( _substream_end == 0 ); // Can't extend a substream + 
_limit = _buffer->capacity() & ~3; + _buf = reinterpret_cast(_buffer->begin()); + } +}; + +#endif // ML_BIT_BUFFER_HPP diff --git a/ethosu/regor/dependencies/mlw_codec/source/ml_encoder_internal.hpp b/ethosu/regor/dependencies/mlw_codec/source/ml_encoder_internal.hpp new file mode 100644 index 00000000..dc9b8b28 --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/source/ml_encoder_internal.hpp @@ -0,0 +1,128 @@ +// +// SPDX-FileCopyrightText: Copyright 2022-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#if !defined ML_ENCODER_INTERNAL_HPP +#define ML_ENCODER_INTERNAL_HPP + +#pragma once + +#include "../include/mlw_encode.h" +#include "ml_bit_buffer.hpp" + +#include +#include + +#if __GNUC__ + #define ML_ENCODER_DLL_EXPORT __attribute__((visibility("default"))) +#elif _WIN32 + #if TARGET_WIN32_DLL + #define ML_ENCODER_DLL_EXPORT __declspec(dllexport) + #else + #define ML_ENCODER_DLL_EXPORT + #endif +#else + #error "undefined export semantics" +#endif + +#if !defined ENABLE_DEBUG_PACKET + #define ENABLE_DEBUG_PACKET (0) +#endif + +#if !defined ENABLE_DEBUG_BITSTREAM + #define ENABLE_DEBUG_BITSTREAM (0) +#endif + +#if ENABLE_DEBUG_PACKET + #include + #define PACKET_LOG(...) printf(__VA_ARGS__) +#else + #define PACKET_LOG(...) +#endif + +#if ENABLE_DEBUG_BITSTREAM + #include + #define BITSTREAM_LOG(...) printf(__VA_ARGS__) +#else + #define BITSTREAM_LOG(...) 
+#endif
+
+// Bitstream syntax constants shared by the encoder and decoder
+constexpr int ETHOSU_SLICELEN_BITS = 15;
+constexpr int ZDIV_DISABLE = 6;       // not alternating mode
+constexpr int ZDIV_EOS = 7;           // indicates end of stream
+constexpr int WDIV_UNCOMPRESSED = 7;  // indicates uncompressed weights
+
+// Weight palette state for one encoded section
+struct palette_t
+{
+    int16_t lut[32] = {0};
+    int16_t inv_lut[512] = {0};
+    int freq[512] = {0};
+    int palsize;          // number of palette entries
+    int palbits;          // bit width of palette entries
+    int direct_offset;    // added to the decoded weight index before direct conversion to sign/mag
+    bool use_zero_runs;   // zeros are coded separately
+    bool only_palette;    // no values outside the palette
+    bool only_zeros;      // special case that the section is all zeros
+};
+
+// Golomb-Rice coding parameters for one slice
+struct slice_params_t
+{
+    uint8_t w_grc_trunc;
+    bool w_uncompressed;
+    uint8_t z_grc_div;
+    uint8_t w_grc_div;
+};
+
+// Per-slice debug record (captured when enable_slice_debug is set)
+struct mle_slice_debug_t
+{
+    slice_params_t params;
+    palette_t palette;
+};
+
+// Internal encoder state context (opaque to the public C interface)
+struct mle_context_t
+{
+    palette_t palette;
+    int syntax = 0;
+    int zero_count = 0;
+    int slicelen_bits = ETHOSU_SLICELEN_BITS;
+    bool palette_valid = false;
+    bool single_slice_sections = false;
+    bool allow_empty_slices = false;
+    bool eos_required = false;
+    bool enable_slice_debug = false;
+    bool disable_lut = false;
+    int8_t fixed_wgrc = -1;
+    int8_t fixed_zgrc = -1;
+    // NOTE(review): element type was lost in extraction; restored as the
+    // per-slice debug record defined above - confirm against encoder sources
+    std::vector<mle_slice_debug_t> slice_debug;
+    void* (*realloc_func)(void*, size_t, int);  // Custom output allocator function
+};
+
+// Integer ceiling division
+inline int div_round_up(int num, int div)
+{
+    return (num + div - 1) / div;
+}
+
+int ml_encode_fwd(mle_context_t *ctx, bitbuf_t &bits, const int16_t *weights, int encode_count, unsigned mlw_encode_flags);
+int ml_encode_section(mle_context_t *ctx, const int16_t *inbuf, int size, palette_t *p, bitbuf_t *bitbuf);
+palette_t *ml_encode_palette(mle_context_t *ctx, const int16_t *weights, int encode_count, int analyse_count, unsigned mlw_encode_flags);
+void ml_encode_eos(mle_context_t *ctx, bitbuf_t &bits, unsigned mlw_encode_flags);
+int ml_encode_internal(mle_context_t *ctx, bitbuf_t &bits, const
int16_t *weights, int encode_count, int analyse_count, unsigned mlw_encode_flags); + +#endif // ML_ENCODER_INTERNAL_HPP + + + diff --git a/ethosu/regor/dependencies/mlw_codec/source/ml_ethosu_encode.cpp b/ethosu/regor/dependencies/mlw_codec/source/ml_ethosu_encode.cpp new file mode 100644 index 00000000..14f92b17 --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/source/ml_ethosu_encode.cpp @@ -0,0 +1,114 @@ +// +// SPDX-FileCopyrightText: Copyright 2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "../include/mlw_encode.h" + +#include "ml_bit_buffer.hpp" +#include "ml_raw_buffer.hpp" +#include "ml_encoder_internal.hpp" + +#include +#include + +#if defined __cplusplus +extern "C" +{ +#endif + + +ML_ENCODER_DLL_EXPORT int32_t ml_encode_ethosu_stream(ml_encode_result_t *result, const ml_ethosu_encode_params_t *ep, ml_weight_source_fn src, void *user_arg, mle_context_t **ctx_out) +{ + constexpr int BUFFERING_REQUEST_SIZE = 8192; // Initial input buffering + constexpr int INITIAL_OUTPUT_BUFFER = 8192; // Initial size of output buffer (doubles at every overflow) + constexpr unsigned VALID_FLAGS = MLW_ENCODE_NO_BITSTREAM; + + assert(result && ep); + if ( !(result && ep && src) ) + { + return 0; + } + + mle_context_t *ctx = mle_create_context(MLW_ENCODE_SYNTAX_ETHOSU); + // Allow forcing parameters for debug validation - it is expected that + // the caller knows what they're doing here since it accesses the opaque + // internals via the public interface. + + assert( !(ep->encoder_flags & ~(VALID_FLAGS)) ); // Check acceptable flags + unsigned ethosu_encode_flags = (ep->encoder_flags & VALID_FLAGS); + + // Input buffering of data from the source function + assert( ep->source_buffering_hint >= 0 ); + int request_size = std::max(BUFFERING_REQUEST_SIZE, ep->source_buffering_hint & 0x00FFFFFF); + raw_buffer_t buffer( request_size ); + + // The source function will communicate the state to this encoding loop + ml_source_state_t state = {0}; + state.eos = false; + + // Output bitstream allocation + raw_buffer_t output(INITIAL_OUTPUT_BUFFER, MLW_ENCODE_ALLOC_STREAM0, ep->realloc_func); + bitbuf_t bits(output, 8, ethosu_encode_flags & MLW_ENCODE_NO_BITSTREAM); + + result->source_length = 0; + + // Repeatedly ask for values until the source function signals end-of-stream. 
+ while ( !state.eos ) + { + int16_t* buf_write = buffer.reserve(request_size); + int received = (*src)(MLW_SOURCE_QUERY_WEIGHTS, &state, buf_write, buffer.capacity() - buffer.used(), user_arg); + buffer.use(received); + + unsigned encode_flags = ethosu_encode_flags; + encode_flags |= MLW_ENCODE_NEW_PALETTE; + + int bytes_written = ml_encode_internal(ctx, bits, buffer.begin(), buffer.used(), buffer.used(), encode_flags); + if ( bytes_written < 0 ) + { + // Encoder errored + mle_destroy_context(ctx); + return -1; + } + result->source_length += buffer.used(); + buffer.clear(); + } + + ml_encode_eos(ctx, bits, ethosu_encode_flags); + + // Populate the return result + assert(bits.byte_pos() == output.used() || (ethosu_encode_flags & MLW_ENCODE_NO_BITSTREAM)); + result->encoded_length = bits.byte_pos(); + result->encoded_data = output.detach(); + result->section_info = nullptr; + result->section_count = 0; + + if (ctx_out != nullptr) + { + assert( *ctx_out == nullptr ); + *ctx_out = ctx; + } + else + { + mle_destroy_context(ctx); + } + return 1; +} + + +#if defined __cplusplus +} // extern "C" +#endif diff --git a/ethosu/regor/dependencies/mlw_codec/source/ml_raw_buffer.hpp b/ethosu/regor/dependencies/mlw_codec/source/ml_raw_buffer.hpp new file mode 100644 index 00000000..62277d4b --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/source/ml_raw_buffer.hpp @@ -0,0 +1,161 @@ +// +// SPDX-FileCopyrightText: Copyright 2022-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +// + +#if !defined ML_RAW_BUFFER_HPP +#define ML_RAW_BUFFER_HPP + +#pragma once + +#include +#include +#include +#include +#include +#include + +typedef void* (*realloc_t)(void *ptr, size_t size, int); + +template +struct raw_buffer_t +{ + static_assert(std::is_trivially_copyable::value, "expected simple storage type"); + constexpr static int CAPACITY_ALIGN = 16; + TYPE *_data; + int _used; + int _capacity; + int _reallocArg = 0; + realloc_t _realloc=&realloc_proxy; + +public: + raw_buffer_t(int capacity, int arg=0, realloc_t rfunc=nullptr) + { + assert(capacity > 0); + _realloc = (rfunc != nullptr) ? rfunc : &realloc_proxy; + _capacity = (capacity + CAPACITY_ALIGN - 1) & ~(CAPACITY_ALIGN - 1); + _reallocArg = arg; + _data = reinterpret_cast(_realloc(nullptr, _capacity * sizeof(TYPE), _reallocArg)); + _used = 0; + } + + raw_buffer_t(raw_buffer_t &&other) + { + _capacity = other._capacity; + other._capacity = 0; + _data = other._data; + other._data = nullptr; + _used = other._used; + other._used = 0; + _reallocArg = other._reallocArg; + _realloc = other._realloc; + } + + raw_buffer_t(TYPE *data, int used, int capacity) + { + _data = data; + _used = used; + _capacity = capacity; + } + + ~raw_buffer_t() + { + if (_data) + { + _realloc(_data, 0, _reallocArg); + } + } + + TYPE *begin() { return _data; } + TYPE *end() { return _data + _used; } + int used() const { return _used; } + int capacity() const { return _capacity; } + void clear() { _used = 0; } + + const TYPE &operator[](int index) const { assert(index < _used); return _data[index]; } + + void set_used(int used) + { + assert(used >= _used); + assert(used <= _capacity); + _used = used; + } + + TYPE *reserve(int count, bool exact_resize=false) + { + int req_capacity = _used + count; + if ( req_capacity > _capacity ) + { + if ( !exact_resize ) + { + req_capacity = std::max(req_capacity, _capacity * 2); + } + 
+ auto *p = reinterpret_cast( _realloc(_data, req_capacity * sizeof(TYPE), _reallocArg) ); + if ( !p ) + { + return nullptr; + } + _data = p; + _capacity = req_capacity; + } + int at = _used; + return _data + at; + } + + TYPE *use(int count) + { + int at = _used; + _used += count; + return _data + at; + } + + TYPE *detach() + { + auto tmp = _data; + _data = nullptr; + return tmp; + } + + void align(int align_bytes, TYPE fill) + { + int count = (((_used + align_bytes - 1) / align_bytes) * align_bytes) - _used; + TYPE *p = reserve(count); + use(count); + while (count--) + { + *p++ = fill; + } + } + + void remove_left(int count) + { + int to_move = _used - count; + if (to_move >= 0) + { + memmove(_data, _data + count, to_move * sizeof(TYPE)); + } + _used = to_move; + } + +private: + static void *realloc_proxy(void *ptr, size_t size, int) + { + return realloc(ptr, size); + } +}; + +#endif // ML_RAW_BUFFER_HPP diff --git a/ethosu/regor/dependencies/mlw_codec/source/mlw_decode.cpp b/ethosu/regor/dependencies/mlw_codec/source/mlw_decode.cpp new file mode 100644 index 00000000..15d597d0 --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/source/mlw_decode.cpp @@ -0,0 +1,361 @@ +// +// SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// + +#include "../include/mlw_decode.h" +#include "ml_encoder_internal.hpp" +#include "ml_bit_buffer.hpp" + +#include +#include +#include + +#if NDEBUG || 1 + // Release build get bits macro + #define bitbuf_get(bb_, name_, len_) bb_.get(len_) +#else + #include + + // Debug build get bits macro + inline int bitbuf_get(bitbuf_t &bb, const char *name, int len) + { + assert(len <= 32); + int tmp = bb.get(len); + printf("%6d %s:%d = %d\n", bb.pos() - len, name, len, tmp); + fflush(stdout); + return tmp; + } +#endif + +// Extract and decode weights from the given bitstream +// +// outbuf - output decoded weights +// bb - input bitstream buffer (wherever it is positioned) +// endpos - bitstream end position (in bytes) +// single_slice - process slices one-at-a-time +// slice_len - slice length in bits +// +// Returns - the number of weights extracted from the bitstream +static int ml_decode_internal(raw_buffer_t &outbuf, bitbuf_t &bb, palette_t &palette, int endpos, bool single_slice, int slice_len) +{ + int start_offset = outbuf.used(); + int w_cnt; + int w_grc_div; + int w_grc_trunc; + int w_uncompressed; + int z_grc_div; + int z_prev_grc_div = -1; + bool new_palette; + int i, j; + endpos = std::min(endpos, bb.byte_length()); + + // Loop over all slices + do { + // Decode slice header + int bits_avail = bb.read_avail(endpos); + if ( bits_avail >= 3 ) + { + z_grc_div = bitbuf_get(bb, "ZDIV", 3); + } + else // Insufficient bits left for a terminator (EOS may be optional) + { + z_grc_div = ZDIV_EOS; + int ones = bb.get(bits_avail); + assert( ones == (1 << bits_avail) - 1 ); + } + + while ( z_grc_div == ZDIV_EOS ) + { + // End of stream + // Byte align + bb.read_align(8); + if ( bb.byte_pos() >= endpos || single_slice ) + { + goto labelExit; + } + z_grc_div = bitbuf_get(bb, "ZDIV", 3); + } + if ( bb.read_avail(endpos) <= 0 ) + { + assert(false); + break; // Unexpectedly reached end of the input stream + } + assert(z_grc_div < 4 || z_grc_div == ZDIV_DISABLE); + bool 
use_zero_runs = z_grc_div != ZDIV_DISABLE; // alternating grc + w_cnt = bitbuf_get(bb, "SLICELEN", slice_len) + (slice_len == ETHOSU_SLICELEN_BITS ? 1 : 0); + w_grc_div = bitbuf_get(bb, "WDIV", 3); + w_grc_trunc = bitbuf_get(bb, "WTRUNC", 1); + new_palette = bitbuf_get(bb, "NEWPAL", 1); + if ( !new_palette ) + { + // At the moment it is not supported to change between alternating + // and non-alternating without redefining the palette (this is because + // the zero is not included in the palette in case of alternating) + bool prev_use_zero_run = z_prev_grc_div != ZDIV_DISABLE; + (void)(prev_use_zero_run); + assert((z_prev_grc_div == -1) || (use_zero_runs == prev_use_zero_run)); + } + z_prev_grc_div = z_grc_div; + if ( new_palette ) + { + palette.direct_offset = bitbuf_get(bb, "DIROFS", 5); + palette.palsize = bitbuf_get(bb, "PALSIZE", 5); + if ( palette.palsize > 0 ) + { + palette.palsize++; + } + palette.palbits = bitbuf_get(bb, "PALBITS", 3) + 2; + for ( i = 0; i < palette.palsize; i++ ) + { + palette.inv_lut[i] = int16_t(bitbuf_get(bb, "PALETTE", palette.palbits)); + } + } + + if ( w_grc_div == WDIV_UNCOMPRESSED ) + { + // Uncompressed mode + w_uncompressed = 1; + int uncompressed_bits; + if ( palette.palsize > 0 ) + { + // Uncompressed bits is given by palette size. + uncompressed_bits = 0; + while ( (1 << uncompressed_bits) < palette.palsize ) + { + uncompressed_bits++; + } + } + else + { + // No palette. PALBITS is used to specify uncompressed bits. + uncompressed_bits = palette.palbits; + } + // In uncompressed mode there's only a remainder part (no unary) + // This is achieved by setting w_grc_div to index bit width + w_grc_div = uncompressed_bits; + } + else + { + w_uncompressed = 0; + assert(w_grc_div < 6); + } + + // Decode the slice + int z_cnt = w_cnt + (( slice_len != ETHOSU_SLICELEN_BITS || new_palette ) ? 
1 : 0); + std::vector w_value(w_cnt); + std::vector z_value(z_cnt); + int w_pos = 0, z_pos = 0; + int w_prev_pos = 0, z_prev_pos = 0; + int w_unary0 = 0, w_unary1 = 0, w_unary1_len = 0, w_q[12] = {0}, wq = 0; + int z_unary = 0, z_q[12] = {0}, zq = 0; + int w_nsymbols = 0; + int w_prev_enable = 0, w_prev_nsymbols = 0, w_prev_q[12] = {0}; + int z_nsymbols = 0; + int z_prev_enable = 0, z_prev_nsymbols = 0, z_prev_q[12] = {0}; + int total_zcnt = 0; + int z_unary_len = z_grc_div < 3 ? 12 : 8; + + // Loop over all chunks in the slice + do + { + // Flow control to possibly throttle either the weights or zero-runs + int balance = use_zero_runs ? w_pos - z_pos : 0; + int w_enable = (balance < 8 || !use_zero_runs) && w_pos < w_cnt; + int z_enable = (balance >= 0 && use_zero_runs) && z_pos < z_cnt; + if ( w_enable ) + { + w_unary0 = w_uncompressed ? 0 : bitbuf_get(bb, "WUNARY0", 12); + } + if ( z_enable ) + { + z_unary = bitbuf_get(bb, "ZUNARY", z_unary_len); + z_nsymbols = 0; + for ( i = 0; i < z_unary_len; i++ ) + { + if ( z_unary & (1 << i) ) + { + zq++; + } + else + { + z_q[z_nsymbols++] = zq; + zq = 0; + } + } + z_pos += z_nsymbols; + } + + if ( w_enable ) + { + w_unary1_len = 0; + int max_symbols = w_uncompressed && w_grc_div > 5 ? 8 : 12; + if ( w_unary0 != 0 ) + { + for ( i = 0; i < max_symbols; i++ ) + { + if ( w_unary0 & (1 << i) ) + { + w_unary1_len++; + } + } + } + w_unary1 = (w_unary1_len > 0) ? 
bitbuf_get(bb, "WUNARY1", w_unary1_len) : 0; + w_nsymbols = 0; + + for ( i = 0; i < max_symbols && (w_nsymbols < (w_cnt - w_pos)); i++ ) + { + int code = 0; + if ( w_unary0 & (1 << i) ) + { + code = 1 + ( w_unary1 & 1 ); + w_unary1 = w_unary1 >> 1; + } + wq += code; + if ( code < 2 || w_grc_trunc ) + { + w_q[w_nsymbols++] = wq; + wq = 0; + } + } + w_pos += w_nsymbols; + } + + // Remainders corresponding to the quotients in the previous chunk + if ( w_prev_enable ) + { + for ( i = 0; i < w_prev_nsymbols && w_prev_pos < w_cnt; i++, w_prev_pos++ ) + { + int remain = bitbuf_get(bb, "WREMAIN", w_grc_div); + w_value[w_prev_pos] = (w_prev_q[i] << w_grc_div) + remain; + } + } + if ( z_prev_enable ) + { + for ( i = 0; i < z_prev_nsymbols && z_prev_pos < z_cnt; i++, z_prev_pos++ ) + { + int remain = 0; + if ( z_grc_div != 0 ) + { + remain = bitbuf_get(bb, "ZREMAIN", z_grc_div); + } + z_value[z_prev_pos] = (z_prev_q[i] << z_grc_div) + remain; + total_zcnt += z_value[z_prev_pos]; + } + } + w_prev_enable = w_enable; + w_prev_nsymbols = w_nsymbols; + std::copy(std::begin(w_q), std::end(w_q), std::begin(w_prev_q)); + z_prev_enable = z_enable; + z_prev_nsymbols = z_nsymbols; + std::copy(std::begin(z_q), std::end(z_q), std::begin(z_prev_q)); + } while ( w_prev_enable || z_prev_enable ); + + // Interleave non-zero and zeros into the outbuf buffer + // Increase the outbuffer to fit the new slice + int16_t *p = outbuf.reserve(w_cnt + total_zcnt); + + // Insert initial zeros + if ( ( slice_len != ETHOSU_SLICELEN_BITS || new_palette ) && use_zero_runs ) + { + for ( j = 0; j < z_value[0]; j++ ) + { + *p++ = 0; + } + } + + // Loop over all weights and insert zeros in-between + for ( i = 0; i < w_cnt; i++ ) + { + int val; + assert(w_value[i] < 512); // HW supports 9bit + if ( w_value[i] < palette.palsize ) + { + val = palette.inv_lut[w_value[i]]; + } + else + { + val = w_value[i] - palette.palsize + palette.direct_offset; + } + int sign = val & 1; + int mag = val >> 1; + *p++ = sign ? 
int16_t(-mag) : int16_t(mag); + if ( use_zero_runs ) + { + for ( j = 0; j < z_value[i + (new_palette ? 1 : 0)]; j++ ) + { + *p++ = 0; + } + } + } + + outbuf.use(w_cnt + total_zcnt); + } while (!single_slice); +labelExit: + return outbuf.used() - start_offset; +} + + +constexpr int INITIAL_BLOCKS = 4; + + +#if defined __cplusplus +extern "C" +{ +#endif + +// Decode a stream +// +// result - Resulting data from decode (must be freeed after use) +// buffer - Incoming bitstream buffer +// size_bytes - Size of the bitstream buffer (in bytes) +void ml_decode_ethosu_stream(ml_decode_result_t *result, const uint8_t *buffer, int size_bytes) +{ + assert(result && buffer && size_bytes); + result->decoded_data = nullptr; + result->section_sizes = nullptr; + + bitbuf_t bb(buffer, size_bytes); + raw_buffer_t output(4096, MLW_ENCODE_ALLOC_STREAM0, nullptr); + palette_t palette; + ml_decode_internal(output, bb, palette, size_bytes, false, ETHOSU_SLICELEN_BITS); + + // Populate the results set + result->decoded_length = output.used(); + result->decoded_data = output.detach(); +} + +ML_ENCODER_DLL_EXPORT void mld_free(ml_decode_result_t *result) +{ + if ( result ) + { + if ( result->decoded_data ) + { + result->decoded_data = static_cast(realloc( result->decoded_data, 0)); + } + if ( result->section_sizes ) + { + free( result->section_sizes ); + result->section_sizes = nullptr; + } + } +} + + +#if defined __cplusplus +} // extern "C" +#endif + diff --git a/ethosu/regor/dependencies/mlw_codec/source/mlw_encode.cpp b/ethosu/regor/dependencies/mlw_codec/source/mlw_encode.cpp new file mode 100644 index 00000000..3a05dab3 --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/source/mlw_encode.cpp @@ -0,0 +1,961 @@ +// +// SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an AS IS BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "../include/mlw_encode.h" + +#include "ml_encoder_internal.hpp" +#include "ml_raw_buffer.hpp" +#include "ml_bit_buffer.hpp" + +#include +#include +#include +#include +#include +#include + +constexpr int ZERO_FREQ_THRESHOLD = 5; +constexpr int MIN_ZERO_RUN_LENGTH = 2; + +// Create palette from the given frequencies +// Freq index 0-511 correspond to weights -256..255 +// partial_data - don't make decisions about data that will be encoded +// that wasn't included in the frequency analysis. +static void create_palette(palette_t *p, bool partial_data, bool disable_lut) +{ + uint64_t freq64[512] = {0}; + int i, all_cnt, all_max_val; + + // Pair the frequency with the value so that + // the array can be sorted on frequency while keeping + // track of the corresponding palette value + all_cnt = 0; + all_max_val = 0; + for ( i = -255; i < 256; i++ ) + { + if ( i == 0 && p->use_zero_runs ) continue; + int sign = i < 0; + int mag = abs(i); + int palval = (mag << 1) | sign; + + // Store palette value in 16 LSB bits, which will not affect the sorting + freq64[palval] = ((static_cast(p->freq[i + 256])) << 16) | palval; + all_cnt += p->freq[i + 256]; + + if ( p->freq[i + 256] > 0 ) + { + all_max_val = std::max(all_max_val, palval); + } + } + + // Cannot use direct offset with partial data. 
+ p->only_zeros = !partial_data && (all_cnt == 0); + p->direct_offset = 0; + if ( !partial_data && (all_cnt != 0) ) + { + // Find the first non-used weight value around zero (0, -1, +1, -2, +2 etc) + for ( i = 0; i < 31; i++ ) + { + if ( (freq64[i] >> 16) != 0 ) + { + break; + } + } + p->direct_offset = i; + } + + // Sort in descending frequency order + std::sort(std::begin(freq64), std::end(freq64), [](uint64_t a, uint64_t b) { return b < a; }); + + // Check if all weights fit into the palette (and the palette is not empty) + p->only_palette = !disable_lut && !partial_data && (freq64[0] >> 16) > 0 && (freq64[32] >> 16) == 0; + + int max_palette_size; + if ( p->only_palette ) + { + max_palette_size = 32; + } + else + { + // For direct-lut we must make sure that the encoded weight + // index is not > 511. We do that by limiting the palette size + // such that the greatest value can be reached after subtracting + // the palette size. + max_palette_size = std::min(32, 511 - all_max_val); + if ( max_palette_size == 1 ) + { + max_palette_size = 0; // because palette of size 1 is not supported + } + } + + // Setup the 32 entry palette + int max_lut_val = 0, val, cnt, lut_cnt = 0; + for ( i = 0; i < max_palette_size; i++ ) + { + cnt = static_cast(freq64[i] >> 16); + val = freq64[i] & 0xffff; + // If partial data, all palette entries must be filled (even if they're wrong) + if ( cnt == 0 && !partial_data ) break; + p->lut[i] = int16_t(val); + max_lut_val = std::max(max_lut_val, val); + lut_cnt += cnt; + } + + // When all weights are the same nonzero value a palette size of 1 is possible; but not supported. + // Make the palette 2 entries long and zero the second entry (it's never indexed). + if ( i == 1 ) + { + p->lut[i++] = 0; + } + + // Heuristic for when to use the palette. If more than half of the + // weights are in the palette then we use it. This ensures we don't + // use palette for e.g. rectangular distributions. 
+ int palbits_val; + if ( !disable_lut && (lut_cnt >= all_cnt / 2) ) + { + p->palsize = i; + palbits_val = max_lut_val; + } + else + { + // No palette + p->palsize = 0; + // If no palette, then palbits is used to specify the + // number of bits required for uncompressed mode, i.e. + // the number of bits for the greatest weight value + palbits_val = all_max_val; + } + + // the palette entry bit width + // minimum 2-bits (because PALBITS is in range 2..9) + int palbits = 2; + while ( (1 << palbits) <= palbits_val ) + { + palbits++; + } + assert(palbits <= 9); + p->palbits = palbits; +} + +static void create_inverse_palette(palette_t *p) +{ + int i; + int val = p->palsize - p->direct_offset; + for ( i = 0; i < 256; i++ ) + { + p->inv_lut[256 + i] = int16_t(val); + p->inv_lut[256 - i] = int16_t(val + 1); + val += 2; + } + p->inv_lut[0] = 0; + + for ( i = 0; i < p->palsize; i++ ) + { + val = p->lut[i]; + int sign = val & 1; + int mag = val >> 1; + int weight = sign ? -mag : mag; + assert( ((weight + 256) >= 0) && ((weight + 256) < 512) ); + p->inv_lut[weight + 256] = int16_t(i); + } +} + +// If palette_size is 512, then palette is not used (in that case the palette is setup +// with the standard alternating unsigned to signed mapping) +static void update_palette(palette_t *p, const int16_t *weights, int weights_count, bool partial_data, bool disable_lut, bool disable_zruns) +{ + int(&freq)[512] = p->freq; + + int total_zeroes = 0; + int zeroes_in_run = 0; + int zeroes_in_all_runs = 0; + + // Calculate frequencies of the given weight stream + for ( int i = 0; i < weights_count; i++ ) + { + freq[weights[i] + 256]++; + if ( weights[i] == 0 ) + { + total_zeroes++; + zeroes_in_run++; + } + else + { + if ( zeroes_in_run >= MIN_ZERO_RUN_LENGTH ) + { + zeroes_in_all_runs += zeroes_in_run; + } + zeroes_in_run = 0; + } + } + + // Detect trailing zero runs in compression + if ( zeroes_in_run >= MIN_ZERO_RUN_LENGTH ) + { + zeroes_in_all_runs += zeroes_in_run; + } + + int 
common_val = 0; + int common_freq = 0; + for ( int i = 0; i < 512; i++ ) + { + // Most common non-zero frequency (because we already have that) + if ( (i != 256) && freq[i] > common_freq ) + { + common_val = i - 256; + common_freq = freq[i]; + } + } + + // Decide if zero-runs (alternating mode) should be used: + // * zero runs must make up at least half of the zeroes + // * zero should be the most common symbol + // * zero should be sufficiently more common than the second most common symbol + bool use_zero_runs = zeroes_in_all_runs >= (total_zeroes / 2); + use_zero_runs &= total_zeroes > (ZERO_FREQ_THRESHOLD * common_freq); + p->use_zero_runs = use_zero_runs && !disable_zruns; + // Create the palette + create_palette(p, partial_data, disable_lut); +} + +#define NWCFG 13 +#define NZCFG 4 // restrict search to ZDIV=0..3 +#define MAX_ZWCFG ((NWCFG > NZCFG) ? NWCFG : NZCFG) + +// (trunc<<4) | div, 0x20 means uncompressed +static constexpr char w_grc_params[] = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x20}; +static constexpr char z_grc_params[] = {0x00, 0x01, 0x02, 0x03}; + +struct grc_param_t +{ + int cfg = 0; + int end_pos = 0; +}; + +template +static int search_grc_params(const TYPE *inval_buf, int n_inval, int zrun_mode, int uncompressed_bits, + std::vector &result, bool single_slice) +{ + assert(uncompressed_bits < 32); + int n_cfg = zrun_mode ? NZCFG : NWCFG; + const char *grc_params = zrun_mode ? z_grc_params : w_grc_params; + + // Greedy forward-only GRC search (with optimisation for avoiding + // unusable GRC parameters). + const int cmd_cost = 40; + + int bit_count[MAX_ZWCFG] = {0}; + int reset_pos[MAX_ZWCFG] = {0}; + bool coded[MAX_ZWCFG] = {false}; + bool any_uncodable[MAX_ZWCFG] = {false}; + int active_bitcount = 0; + int active_cfg = -1; + int add_uncompressed_bits = (uncompressed_bits > 0) ? 
uncompressed_bits : 100; + + for ( int i = 0; i < n_inval; i++ ) + { + int value = inval_buf[i]; + + int best_bitcount = 0x7FFFFFFF; + int best_cfg = 0; + + // Loop over GRC parameters, calculate bits to code value, and then update the search state + for ( int j = 0; j < n_cfg; j++ ) + { + int div = grc_params[j] & 15; + int trunc = grc_params[j] >> 4; + int q = value >> div; + int bits = trunc ? std::min(q + 1, 2) + div : q + 1 + div; + + bool can_code = !(!zrun_mode && ((trunc && q > 2) || q > 31)); + if ( trunc == 2 ) + { + bits = add_uncompressed_bits; + can_code = true; + } + + if ( can_code ) + { + if ( !coded[j] ) + { + bit_count[j] = active_bitcount; // Reset non-coded to current best + } + bit_count[j] = bit_count[j] + bits; + + if ( bit_count[j] < best_bitcount ) + { + best_bitcount = bit_count[j]; + best_cfg = j; + } + } + else + { + reset_pos[j] = i + 1; + bit_count[j] += cmd_cost; // Would have to change away if used + } + + coded[j] = can_code; + any_uncodable[j] |= !can_code; + } + + // In single-slice mode we can check the bit counts afterwards; otherwise we record + // slice start points by tracking the minimum of the accumulting bit counts for + // different grc parameters. + if ( !single_slice ) + { + bool must_code = (active_cfg == -1) || !coded[active_cfg]; + if ( must_code || ((best_cfg != active_cfg) && (best_bitcount + cmd_cost) < active_bitcount) ) + { + // Commit non-initial changes + if ( active_cfg != -1 ) + { + // Range elision (was the other config better all along?) + if ( (bit_count[best_cfg] < bit_count[active_cfg]) && (reset_pos[best_cfg] <= reset_pos[active_cfg]) ) + { + // If the current BEST config started before the ACTIVE config in time, then switch to using the BEST config. + active_cfg = best_cfg; // Duplicated on both paths for clarity + } + else + { + // Otherwise use the ACTIVE config for this slice before switching to the BEST config. 
+ grc_param_t param; + param.cfg = active_cfg; + param.end_pos = i; + assert((active_cfg != 12) || (uncompressed_bits != 0)); + result.push_back(param); + } + } + active_cfg = best_cfg; + } + } + else if ( active_cfg == -1 ) + { + active_cfg = best_cfg; + } + + active_bitcount = bit_count[active_cfg]; + } + + // terminate the run + if ( result.empty() || (result.back().cfg != active_cfg) ) + { + // If single slice then select the best minimum-bits configuration + if (single_slice) + { + assert( result.empty() ); + active_cfg = -1; + int max_bit_count = std::numeric_limits::max(); + for (int i=0; i < n_cfg; i++) + { + if ( !any_uncodable[i] && (bit_count[i] <= max_bit_count) ) + { + if ( (active_cfg != 12) || (uncompressed_bits != 0) ) + { + active_cfg = i; + max_bit_count = bit_count[i]; + } + } + } + assert(active_cfg != -1); // There isn't a usable grc parameter (fatal) + } + + grc_param_t param; + param.cfg = active_cfg; + assert((active_cfg != 12) || (uncompressed_bits != 0)); + result.push_back(param); + } + result.back().end_pos = n_inval; + + return active_bitcount; +} + +#if !ENABLE_DEBUG_BITSTREAM + // Release build putbits macro + #define bitbuf_put(bb_, name_, len_, data_) bb_->put(len_, data_) + #define bitbuf_align(bb_, name_, len_, data_) bb_->align(len_, data_) +#else + // Debug build putbits macro + inline void bitbuf_put(bitbuf_t *bb, const char *name, int len, int data) + { + assert(len <= 32); + int pre_pos = bb->pos(); + bb->put(len, data); + BITSTREAM_LOG("%6d %s:%d = %d\n", pre_pos, name, bb->pos()-pre_pos, data); + } + + // Debug build putbits macro + inline void bitbuf_align(bitbuf_t *bb, const char *name, int len, int data) + { + assert(len <= 32); + int pre_pos = bb->pos(); + bb->align(len, data); + BITSTREAM_LOG("%6d %s:%d = %d\n", pre_pos, name, bb->pos()-pre_pos, data); + } +#endif + +static slice_params_t encode_slice_header(mle_context_t *ctx, int slicelen, bool new_palette, int uncompressed_bits, int w_cfg, int z_cfg, bitbuf_t *bb) 
+{ + assert( (ctx->slicelen_bits == 15) || (ctx->slicelen_bits == 17) ); // Currently known formats + assert( slicelen < (1 << ctx->slicelen_bits) ); + assert( w_cfg >= 0 && w_cfg < (sizeof(w_grc_params)/sizeof(w_grc_params[0]))); + + // GRC parameters for this slice + int w_grc_trunc = (w_grc_params[w_cfg] >> 4) == 1; + int w_uncompressed = (w_grc_params[w_cfg] >> 4) == 2; + // Callers can signal a truly empty slice with a negative z_cfg index + assert( ((z_cfg < 0) && (slicelen == 0)) || (ctx->allow_empty_slices && slicelen >= 0) || (slicelen >= 1) ); + int z_grc_div = (z_cfg < 0) ? ZDIV_DISABLE : z_grc_params[z_cfg] & 15; + int w_grc_div = w_uncompressed ? uncompressed_bits : (w_grc_params[w_cfg] & 15); + + int zdiv = ctx->palette.use_zero_runs ? z_grc_div : ZDIV_DISABLE; + int wdiv = !w_uncompressed ? w_grc_div : WDIV_UNCOMPRESSED; + + if ( ENABLE_DEBUG_BITSTREAM ) + { + BITSTREAM_LOG("slice: bitoffset %d slicelen %d zdiv %d wdiv %d wtrunc %d newpal %d palbits %d palsize %d\n", bb->pos(), + slicelen, zdiv, wdiv, w_grc_trunc, new_palette, ctx->palette.palbits, ctx->palette.palsize); + } + + // Write slice header + bitbuf_put(bb, "ZDIV", 3, zdiv); + bitbuf_put(bb, "SLICELEN", ctx->slicelen_bits, ctx->allow_empty_slices ? 
slicelen : slicelen - 1); + bitbuf_put(bb, "WDIV", 3, wdiv); + bitbuf_put(bb, "WTRUNC", 1, w_grc_trunc); + bitbuf_put(bb, "NEWPAL", 1, new_palette); + if ( new_palette ) + { + bitbuf_put(bb, "DIROFS", 5, ctx->palette.direct_offset); + bitbuf_put(bb, "PALSIZE", 5, std::max(0, ctx->palette.palsize - 1)); + bitbuf_put(bb, "PALBITS", 3, ctx->palette.palbits - 2); + for (int i = 0; i < ctx->palette.palsize; i++ ) + { + bitbuf_put(bb, "PALETTE", ctx->palette.palbits, ctx->palette.lut[i]); + } + } + + slice_params_t header; + header.w_grc_trunc = w_grc_trunc; + header.w_uncompressed = w_uncompressed; + header.z_grc_div = z_grc_div; + header.w_grc_div = w_grc_div; + return header; +} + + +static void encode_slice(mle_context_t *ctx, const int16_t *w_values, int weight_count, const int32_t *z_values, int zero_count, bool new_palette, + int uncompressed_bits, int w_cfg, int z_cfg, bitbuf_t *bb) +{ + int w_cnt = weight_count; + int z_cnt = (z_values && zero_count) ? w_cnt + (new_palette ? 1 : 0) : 0; + + slice_params_t hdr = encode_slice_header(ctx, w_cnt, new_palette, uncompressed_bits, w_cfg, z_cfg, bb); + + assert(z_cfg >= 0 && "slice was signalled as truly empty"); + + // Record slice parameters for HW testbench debugging + if ( ctx->enable_slice_debug ) + { + ctx->slice_debug.push_back( mle_slice_debug_t { hdr, ctx->palette } ); + } + + int w_grc_div = hdr.w_grc_div; + bool w_uncompressed = hdr.w_uncompressed; + int z_grc_div = hdr.z_grc_div; + int w_grc_trunc = hdr.w_grc_trunc; + + int j; + int z_unary_len = z_grc_div < 3 ? 
12 : 8; + int w_pos = 0, z_pos = 0; + int w_unary0 = 0, w_unary1 = 0, w_unary1_len = 0, w_q = -1, w_r = 0; + int z_unary = 0, z_q = -1, z_r = 0; + + int w_remain_data[2][12] = {{0}}; + int *w_remain = w_remain_data[0]; + int *w_prev_remain = w_remain_data[1]; + int w_nsymbols = 0; + int w_prev_enable = 0, w_prev_nsymbols = 0; + + int z_remain_data[2][12] = {{0}}; + int *z_remain = z_remain_data[0]; + int *z_prev_remain = z_remain_data[1]; + int z_nsymbols = 0; + int z_prev_enable = 0, z_prev_nsymbols = 0; + bool use_zero_runs = ctx->palette.use_zero_runs; + + do + { + int balance = use_zero_runs ? w_pos - z_pos : 0; + int w_enable = balance < 8 && w_pos < w_cnt; + int z_enable = balance >= 0 && use_zero_runs && z_pos < z_cnt; + if ( w_enable ) + { + // Encode chunk (weights) + j = 0; + w_nsymbols = 0; + w_unary0 = 0; + w_unary1 = 0; + w_unary1_len = 0; + int max_symbols = (w_uncompressed && w_grc_div > 5) ? 8 : 12; + while ( j < max_symbols ) + { + if ( w_q < 0 ) + { + if ( w_pos < w_cnt ) + { + int value = w_values[w_pos]; + assert( value >= 0 && value < 512 ); + w_q = value >> w_grc_div; + w_r = value & ((1 << w_grc_div) - 1); + assert(w_q <= 31 && (!w_grc_trunc || w_q <= 2)); + } + else + { + w_q = 0; + w_r = -1; // don't send remainder + } + } + while ( w_q >= 0 && j < max_symbols ) + { + w_unary0 |= w_q > 0 ? (1 << j) : 0; + if ( w_q > 0 ) + { + w_unary1 |= w_q > 1 ? 
(1 << w_unary1_len) : 0; + w_unary1_len++; + } + j++; + w_q -= 2; + if ( w_grc_trunc ) w_q--; + } + if ( w_q < 0 && w_r >= 0 ) + { + w_remain[w_nsymbols] = w_r; + w_nsymbols++; + w_pos++; + } + } + } + + if ( z_enable ) + { + // Encode chunk (zrun) + j = 0; + z_nsymbols = 0; + z_unary = 0; + while ( j < z_unary_len ) + { + if ( z_q < 0 ) + { + if ( z_pos < z_cnt ) + { + int value = z_values[z_pos]; + z_q = value >> z_grc_div; + z_r = value & ((1 << z_grc_div) - 1); + assert( z_q >= 0 ); // There are no negative length z-runs + } + else + { + z_q = 0; + z_r = -1; + } + } + while ( z_q >= 0 && j < z_unary_len ) + { + z_unary |= z_q > 0 ? (1 << j) : 0; + j++; + z_q--; + } + if ( z_q < 0 && z_r >= 0 ) + { + assert( z_nsymbols < 12 ); + z_remain[z_nsymbols] = z_r; + z_nsymbols++; + z_pos++; + } + } + } + + // Write chunk to bitstream + if ( w_enable && !w_uncompressed ) + { + bitbuf_put(bb, "WUNARY0", 12, w_unary0); // 12 bits + } + if ( z_enable ) + { + bitbuf_put(bb, "ZUNARY", z_unary_len, z_unary); // 12 or 8 bits + } + if ( w_enable && !w_uncompressed && (w_unary1_len > 0) ) + { + bitbuf_put(bb, "WUNARY1", w_unary1_len, w_unary1); // max 12 bits + } + if ( w_prev_enable ) + { + for (int i = 0; i < w_prev_nsymbols; i++ ) + { + bitbuf_put(bb, "WREMAIN", w_grc_div, w_prev_remain[i]); + } + } + if ( z_prev_enable && (z_grc_div > 0) ) + { + for (int i = 0; i < z_prev_nsymbols; i++ ) + { + bitbuf_put(bb, "ZREMAIN", z_grc_div, z_prev_remain[i]); + } + } + w_prev_enable = w_enable; + w_prev_nsymbols = w_nsymbols; + std::swap(w_prev_remain, w_remain); + z_prev_enable = z_enable; + z_prev_nsymbols = z_nsymbols; + std::swap(z_prev_remain, z_remain); + } while ( w_prev_enable || z_prev_enable ); +} + + +int ml_encode_section(mle_context_t *ctx, const int16_t *inbuf, int size, palette_t *p, bitbuf_t *bitbuf) +{ + bool new_palette = (p != nullptr); + + // Reuse previous if not specified + if ( p == nullptr ) + { + p = &ctx->palette; + } + + // Uncompressed mode can only be used 
if either all weights + // are in the palette OR if the palette is not used. + int uncompressed_bits = 0; + if ( p->only_palette ) + { + // Uncompressed bits derived from palette size + while ( (1 << uncompressed_bits) < p->palsize ) + { + uncompressed_bits++; + } + } + else if ( p->palsize == 0 ) + { + // Uncompressed bits is palbits (which is the bitdepth of the greatest weight) + uncompressed_bits = p->palbits; + } + + // If there are no weights at all, emit an empty slice header, then exit. + if ( size == 0 ) + { + // Signal a truly empty slice using -ve zgrc to ensure ZDIV_DISABLE is written to the stream. + if ( ctx->allow_empty_slices ) + { + encode_slice_header(ctx, 0, new_palette, uncompressed_bits, 0, -1, bitbuf); + } + return 0; + } + + std::vector weight_values; + weight_values.reserve(size); + + // If zruns was enabled, expect total to be < weight_values/2 + std::vector zrun_values; + if ( p->use_zero_runs ) + { + zrun_values.reserve( size / 4); + } + + // Get weights (or weight indicies) AND zero-runs from the input weight stream. + int i = 0; + bool allow_empty_slices = !p->only_zeros || ctx->allow_empty_slices; + int total_zcnt = 0; + const int max_slice_len = (1 << ctx->slicelen_bits) - 1; + while ( 1 ) + { + if ( p->use_zero_runs ) + { + int zcnt = 0; + // Count zero run + // Special case: if all weights in the section are zero, we must + // still ensure we have one coded weight so the the slice length + // doesn't become 0. 
Therefore we skip the first zero run and code + // the zero explicitly as a weight value instead + if ( allow_empty_slices || i > 0 ) + { + while ( i < size && inbuf[i] == 0 && zcnt < max_slice_len ) + { + zcnt++; + i++; + } + } + total_zcnt += zcnt; + zrun_values.push_back(zcnt); + } + if ( i == size ) break; + int16_t value = p->inv_lut[inbuf[i] + 256]; + weight_values.push_back(value); + i++; + } + + // Search for good GRC parameters for the weight stream + std::vector w_slice_cfg; + int n_weights = int(weight_values.size()); + if ( n_weights ) + { + // Use a fixed grc config index if provided (partial-data mode sets this) + if ( ctx->fixed_wgrc >= 0 ) + { + w_slice_cfg.push_back(grc_param_t{ ctx->fixed_wgrc, n_weights }); + } + else + { + search_grc_params(weight_values.data(), n_weights, 0, uncompressed_bits, w_slice_cfg, ctx->single_slice_sections); + } + } + int n_w_slice = int(w_slice_cfg.size()); + + // Search for good GRC parameters for the zrun stream + std::vector z_slice_cfg; + if ( p->use_zero_runs ) + { + // Use a fixed grc config index if provided (partial-data mode sets this) + if ( ctx->fixed_zgrc >= 0 ) + { + z_slice_cfg.push_back(grc_param_t{ ctx->fixed_zgrc, n_weights + 1 }); + } + else + { + search_grc_params(zrun_values.data(), n_weights + 1, 1, 0, z_slice_cfg, ctx->single_slice_sections); + } + } + int n_z_slice = int(z_slice_cfg.size()); + + int loops = 0; + + // Encode bitstream slice + int pos = 0, i_w_slice = 0, i_z_slice = 0; + bool only_zero_runs_pass = !zrun_values.empty(); + while ( (pos < n_weights) || new_palette || only_zero_runs_pass ) + { + int w_len = 0; + int z_len = 0; + + if ( i_w_slice < n_w_slice ) + { + w_len = w_slice_cfg[i_w_slice].end_pos - pos; + w_len = std::min(w_len, max_slice_len); + } + + if ( i_z_slice < n_z_slice ) + { + z_len = z_slice_cfg[i_z_slice].end_pos - pos; + z_len = std::min(z_len, max_slice_len); + } + + // The first slice (when new_palette is 1) encodes zero runs both at the + // beginning and end 
(i.e. number of zero runs are len+1). + // The following slices only encode zero runs at the end (there cannot be + // any zeros in the beginning since they are encoded by the previous slice) + const int32_t *zrun_buf = p->use_zero_runs ? zrun_values.data() + pos + !(new_palette || ctx->allow_empty_slices) : nullptr; + const int16_t *w_buf = w_len ? weight_values.data() + pos : nullptr; + int w_cfg = w_len ? w_slice_cfg[i_w_slice].cfg : 0; + int z_cfg = p->use_zero_runs ? z_slice_cfg[i_z_slice].cfg : 0; + + encode_slice(ctx, w_buf, w_len, zrun_buf, z_len, new_palette, uncompressed_bits, w_cfg, z_cfg, bitbuf); + new_palette = 0; + + if ( z_len <= 0 && w_len > 0 ) + pos += w_len; + else if ( w_len <= 0 && z_len > 0 ) + pos += z_len; + else + pos += std::min(z_len, w_len); + + if ( i_w_slice < n_w_slice && w_slice_cfg[i_w_slice].end_pos <= pos ) + { + i_w_slice++; + } + if ( i_z_slice < n_z_slice && z_slice_cfg[i_z_slice].end_pos <= pos ) + { + i_z_slice++; + } + loops++; + only_zero_runs_pass = false; + } + // Single-slice sections can only generate one slice (a single loop) + assert( !ctx->single_slice_sections || (ctx->single_slice_sections && loops == 1) ); + if ( ctx->single_slice_sections && (loops != 1) ) + { + return -1; + } + return total_zcnt; +} + + +palette_t *ml_encode_palette(mle_context_t *ctx, const int16_t *weights, int encode_count, int analyse_count, unsigned mlw_encode_flags) +{ + palette_t *palette = nullptr; + if ( !ctx->palette_valid || (mlw_encode_flags & MLW_ENCODE_INSERT_PALETTE) ) + { + if (mlw_encode_flags & MLW_ENCODE_RESET_PALETTE) + { + memset( ctx->palette.freq, 0, sizeof(ctx->palette.freq) ); + } + + bool partial_data = (mlw_encode_flags & MLW_ENCODE_PARTIAL_DATA) != 0; + bool disable_lut = (mlw_encode_flags & MLW_ENCODE_NO_PALETTE_LUT) != 0; + bool disable_zruns = (mlw_encode_flags & MLW_ENCODE_NO_ZERO_RUNS) != 0; + + assert( analyse_count >= encode_count && "Must analyse at least as much as is encoded"); + + 
update_palette(&ctx->palette, weights, analyse_count, partial_data, disable_lut, disable_zruns); + ctx->palette_valid = true; + if ( !(mlw_encode_flags & MLW_ENCODE_DPIC_FORCE_PARAMS) ) + { + ctx->fixed_wgrc = (partial_data) ? 5 : -1; + ctx->fixed_zgrc = (partial_data) ? 3 : -1; + } + + create_inverse_palette(&ctx->palette); + palette = &ctx->palette; + } + return palette; +} + +void ml_encode_eos(mle_context_t *ctx, bitbuf_t &bits, unsigned mlw_encode_flags) +{ + // Add end of stream marker and align to 128bit + bitbuf_t *bb = &bits; + if ( ctx->eos_required ) + { + bitbuf_put(bb, "ZDIV", 3, ZDIV_EOS); + } + bitbuf_align(bb, "BYTEALIGN", 8, 0xff); + + if ( !(mlw_encode_flags & MLW_ENCODE_NO_PADDING) ) + { + bb->align( 128, 0xFF ); + } + bb->flush(); +} + +int ml_encode_internal(mle_context_t *ctx, bitbuf_t &bits, const int16_t *weights, int encode_count, int analyse_count, unsigned mlw_encode_flags) +{ + palette_t *palette = ml_encode_palette(ctx, weights, encode_count, analyse_count, mlw_encode_flags); + + int zresult = ml_encode_section(ctx, weights, encode_count, palette, &bits); + if ( zresult < 0 ) + { + return -1; + } + ctx->zero_count += zresult; + return 0; +} + +extern "C" +{ + +ML_ENCODER_DLL_EXPORT mle_context_t *mle_create_context(int32_t syntax) +{ + mle_context_t *ctx = new mle_context_t; + ctx->zero_count = 0; + ctx->syntax = syntax; + ctx->realloc_func = nullptr; + if (syntax == MLW_ENCODE_SYNTAX_ETHOSU) + { + ctx->slicelen_bits = ETHOSU_SLICELEN_BITS; + ctx->allow_empty_slices = false; + ctx->single_slice_sections = false; + ctx->eos_required = true; + } + else if (syntax == MLW_ENCODE_SYNTAX_ETHOSU_FWD) + { + ctx->slicelen_bits = 0; + } + else + { + assert(false && "bad syntax"); + delete ctx; + return nullptr; + } + return ctx; +} + +ML_ENCODER_DLL_EXPORT int mle_context_query_zeroes(mle_context_t *ctx) +{ + assert( ctx ); + return ctx->zero_count; +} + +ML_ENCODER_DLL_EXPORT void mle_context_set_allocator(mle_context_t *ctx, void* 
(*realloc_func)(void*, size_t, int)) +{ + assert( ctx ); + ctx->realloc_func = realloc_func; +} + +ML_ENCODER_DLL_EXPORT void mle_destroy_context(mle_context_t *ctx) +{ + assert(ctx); + delete ctx; +} + +ML_ENCODER_DLL_EXPORT int mle_encode(mle_context_t *ctx, ml_encode_result_t *result, const int16_t *inbuf, int inbuf_size, unsigned mlw_encode_flags) +{ + assert( ctx && result ); + raw_buffer_t output(4096, MLW_ENCODE_ALLOC_STREAM0, ctx->realloc_func); + bitbuf_t bits(output, 4096, mlw_encode_flags & MLW_ENCODE_NO_BITSTREAM); + int written = 0; + + if ( ctx->syntax == MLW_ENCODE_SYNTAX_ETHOSU_FWD ) + { + written = ml_encode_fwd(ctx, bits, inbuf, inbuf_size, mlw_encode_flags); + } + else + { + int start = bits.byte_pos(); + if ( ml_encode_internal(ctx, bits, inbuf, inbuf_size, inbuf_size, mlw_encode_flags) < 0 ) + { + return -1; + } + ml_encode_eos(ctx, bits, mlw_encode_flags); + written = bits.byte_pos() - start; + } + + if ( written >= 0 ) + { + result->encoded_data = output.detach(); + result->encoded_length = written; + result->section_info = nullptr; + result->section_count = 0; + } + return written; +} + +ML_ENCODER_DLL_EXPORT void mle_free(ml_encode_result_t *result) +{ + if ( result ) + { + if ( result->encoded_data ) + { + free( result->encoded_data ); + result->encoded_data = nullptr; + } + if ( result->section_info ) + { + free( result->section_info ); + result->section_info = nullptr; + } + } +} + +} diff --git a/ethosu/regor/dependencies/mlw_codec/source/mlw_encode_fwd.cpp b/ethosu/regor/dependencies/mlw_codec/source/mlw_encode_fwd.cpp new file mode 100644 index 00000000..c0bcc101 --- /dev/null +++ b/ethosu/regor/dependencies/mlw_codec/source/mlw_encode_fwd.cpp @@ -0,0 +1,197 @@ +// +// SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +// +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the License); you may +// not use this file except in compliance with the License. 
// You may obtain a copy of the License at
//
// www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an AS IS BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//
// Fast-weight (FWD) encoder for the mlw_codec: palettised (LUT2/LUT4) or
// raw 8-bit encoding of zero-corrected weight values, preceded by a
// fixed-size 256-bit stream header.

#include "../include/mlw_encode.h"

#include "ml_encoder_internal.hpp"
#include "ml_raw_buffer.hpp"
#include "ml_bit_buffer.hpp"

// NOTE(review): the angle-bracket header names below appear to have been
// stripped in this view of the patch (likely <algorithm>, <cassert>,
// <cstdint>, <cstdlib>, ...) — confirm against the original file.
#include
#include
#include
#include
#include
#include

// Maximum number of palette (LUT) entries supported by the format.
constexpr static int LUT_MAX = 16;
// Size of the inverse-index table: covers all values foldable by fold()
// into a non-negative index (sign bit in the lsb).
constexpr static int INV_MAX = 512;

// Stream header state built during analysis and emitted by fwd_emit_header().
struct fwd_header_t
{
    bool raw_mode_flag = false;   // true: weights stored as raw 8-bit values (no palette)
    bool small_lut_flag = false;  // true: 2-bit palette indices (<= 4 LUT entries used)
    int8_t zero_adjust = 0;       // offset added to raw values to fit them into 8 bits
    int16_t lut[LUT_MAX] = {};    // palette of weight values
    int8_t inv_index[INV_MAX] = {};  // folded-value -> LUT slot; -1 means not yet indexed
    fwd_header_t()
    {
        // All values start out un-indexed.
        std::fill_n(inv_index, INV_MAX, -1);
    }
};


// Map a signed value to a non-negative index with the sign in the lsb,
// so that small-magnitude values get small indices.
static inline int32_t fold(int32_t value)
{
    // Fold into positive value (sign in lsb)
    return (abs(value) << 1) | (uint32_t(value) >> 31);
}


// Emit the fixed-size stream header: 1+1 flag bits, 102 reserved zero bits,
// 8-bit zero adjust, then 16 folded 9-bit LUT entries — 256 bits in total
// (matches the space reserved by ml_encode_fwd before analysis).
void fwd_emit_header(bitbuf_t &bits, const fwd_header_t &hdr)
{
    bits.put( 1, hdr.raw_mode_flag ? 1 : 0 );
    bits.put( 1, hdr.small_lut_flag ? 1 : 0 );
    bits.fill( 102, 0 );
    bits.put_masked( 8, hdr.zero_adjust );
    for (int i = 0; i < LUT_MAX; i++)
    {
        bits.put( 9, fold(hdr.lut[i]) );
    }
}


// Analyse the weight stream: compute its value range, build the palette
// (LUT) and inverse index in hdr, and decide raw vs LUT2/LUT4 mode.
// While the palette stays small (<= 4 entries) the 2-bit indices are
// speculatively written straight into the output stream; if analysis later
// selects LUT4 or raw mode, ml_encode_fwd rewinds and re-encodes instead.
// Returns false if the stream cannot be encoded in this format.
bool fwd_analyse(fwd_header_t &hdr, const int16_t *weights, int count, bitbuf_t &bits)
{
    int range_min = 1000;
    int range_max = -1000;
    int8_t *inv = hdr.inv_index;
    int16_t *lut = hdr.lut;
    int lut_used = 0;
    bool use_lut = true;

    // Must check all the zero-point-correct values for full range
    for (int i = 0; i < count; i++)
    {
        int value = weights[i];
        range_min = std::min( range_min, value );
        range_max = std::max( range_max, value );

        // Update the LUT only while it's still viable (predicts well).
        if ( use_lut )
        {
            // Map the signed value to the LUT via +ve indexed table
            int idx = fold(value);
            assert( idx < INV_MAX );

            // Check if value has already been indexed before adding a
            // new lut entry.
            if ( inv[idx] < 0 )
            {
                if ( lut_used < LUT_MAX )
                {
                    inv[idx] = lut_used;
                    lut[lut_used] = value;
                    lut_used++;
                }
                else
                {
                    use_lut = false;  // LUT was full and is now unusable
                }
            }
            // While lut2 is valid, encode the entries. When we're
            // done the bitstream will be ready.
            if (lut_used <= 4)
            {
                bits.put(2, inv[idx]);
            }
        }
    }

    hdr.raw_mode_flag = !use_lut;
    hdr.small_lut_flag = (lut_used <= 4);
    hdr.zero_adjust = 0;

    // If raw mode, calculate the zero point
    if ( hdr.raw_mode_flag )
    {
        // Raw mode stores 8-bit values, so the value span must fit in 256
        // and out-of-range streams are shifted into [-128, 127].
        int full_range = (range_max - range_min);
        if (full_range >= 256)
        {
            return false;  // Can't encode this stream
        }
        else if ( range_min < -128 )
        {
            hdr.zero_adjust = -128 - range_min;  // Raw values need offsetting +ve by this amount
        }
        else if ( range_max > 127 )
        {
            hdr.zero_adjust = 127 - range_max;  // Raw values need offsetting -ve by this amount
        }
    }

    // Overall encodable value range for this format is [-256, 255].
    return (range_min >= -256) && (range_max < 256);
}

// Encode zero-corrected weight values in the optimal fast-weight format.
+int ml_encode_fwd(mle_context_t *ctx, bitbuf_t &bits, const int16_t *weights, int count, unsigned mlw_encode_flags) +{ + fwd_header_t header; + int pos = bits.pos(); + bits.fill(256, 0); // Reserve space for header + + // Encode lut2 weights directly to the main stream while analysing + if ( !fwd_analyse(header, weights, count, bits) ) + { + return -1; // Encoding error + } + + // Check for forced no palette + if ( mlw_encode_flags & MLW_ENCODE_NO_PALETTE_LUT ) + { + header.raw_mode_flag = 1; + header.small_lut_flag = 0; + } + + // Use a substream of the main stream for the header + bitbuf_t hdr_bits(bits, pos); + fwd_emit_header(hdr_bits, header); + bits.sync(hdr_bits); + + // LUT2 + if ( header.small_lut_flag ) + { + assert( !header.raw_mode_flag ); + } + // RAW + else if ( header.raw_mode_flag ) + { + bits.reposition(pos + 256); + for (int i=0; i < count; i++) + { + int value = (weights[i] + header.zero_adjust) & 0xFF; + bits.put(8, value); + } + } + // LUT4 + else + { + bits.reposition(pos + 256); + for (int i=0; i < count; i++) + { + int idx = fold(weights[i]); + bits.put(4, header.inv_index[idx]); + } + } + + bits.align(256, 0); + bits.flush(); + + int written = bits.pos() / 8; + return written; +} diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/BUILD.bazel b/ethosu/regor/dependencies/thirdparty/Catch2/BUILD.bazel new file mode 100644 index 00000000..c51bf57e --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/BUILD.bazel @@ -0,0 +1,95 @@ +load("@bazel_skylib//rules:expand_template.bzl", "expand_template") + +expand_template( + name = "catch_user_config", + out = "catch2/catch_user_config.hpp", + substitutions = { + "@CATCH_CONFIG_CONSOLE_WIDTH@": "80", + "@CATCH_CONFIG_DEFAULT_REPORTER@": "console", + "#cmakedefine CATCH_CONFIG_ANDROID_LOGWRITE": "", + "#cmakedefine CATCH_CONFIG_BAZEL_SUPPORT": "#define CATCH_CONFIG_BAZEL_SUPPORT", + "#cmakedefine CATCH_CONFIG_COLOUR_WIN32": "", + "#cmakedefine CATCH_CONFIG_COUNTER": "", + "#cmakedefine 
CATCH_CONFIG_CPP11_TO_STRING": "", + "#cmakedefine CATCH_CONFIG_CPP17_BYTE": "", + "#cmakedefine CATCH_CONFIG_CPP17_OPTIONAL": "", + "#cmakedefine CATCH_CONFIG_CPP17_STRING_VIEW": "", + "#cmakedefine CATCH_CONFIG_CPP17_UNCAUGHT_EXCEPTIONS": "", + "#cmakedefine CATCH_CONFIG_CPP17_VARIANT": "", + "#cmakedefine CATCH_CONFIG_DISABLE_EXCEPTIONS_CUSTOM_HANDLER": "", + "#cmakedefine CATCH_CONFIG_DISABLE_EXCEPTIONS": "", + "#cmakedefine CATCH_CONFIG_DISABLE_STRINGIFICATION": "", + "#cmakedefine CATCH_CONFIG_DISABLE": "", + "#cmakedefine CATCH_CONFIG_ENABLE_ALL_STRINGMAKERS": "", + "#cmakedefine CATCH_CONFIG_ENABLE_OPTIONAL_STRINGMAKER": "", + "#cmakedefine CATCH_CONFIG_ENABLE_PAIR_STRINGMAKER": "", + "#cmakedefine CATCH_CONFIG_ENABLE_TUPLE_STRINGMAKER": "", + "#cmakedefine CATCH_CONFIG_ENABLE_VARIANT_STRINGMAKER": "", + "#cmakedefine CATCH_CONFIG_EXPERIMENTAL_REDIRECT": "", + "#cmakedefine CATCH_CONFIG_FALLBACK_STRINGIFIER @CATCH_CONFIG_FALLBACK_STRINGIFIER@": "", + "#cmakedefine CATCH_CONFIG_FAST_COMPILE": "", + "#cmakedefine CATCH_CONFIG_GETENV": "", + "#cmakedefine CATCH_CONFIG_GLOBAL_NEXTAFTER": "", + "#cmakedefine CATCH_CONFIG_NO_ANDROID_LOGWRITE": "", + "#cmakedefine CATCH_CONFIG_NO_COLOUR_WIN32": "", + "#cmakedefine CATCH_CONFIG_NO_COUNTER": "", + "#cmakedefine CATCH_CONFIG_NO_CPP11_TO_STRING": "", + "#cmakedefine CATCH_CONFIG_NO_CPP17_BYTE": "", + "#cmakedefine CATCH_CONFIG_NO_CPP17_OPTIONAL": "", + "#cmakedefine CATCH_CONFIG_NO_CPP17_STRING_VIEW": "", + "#cmakedefine CATCH_CONFIG_NO_CPP17_UNCAUGHT_EXCEPTIONS": "", + "#cmakedefine CATCH_CONFIG_NO_CPP17_VARIANT": "", + "#cmakedefine CATCH_CONFIG_NO_GETENV": "", + "#cmakedefine CATCH_CONFIG_NO_GLOBAL_NEXTAFTER": "", + "#cmakedefine CATCH_CONFIG_NO_POSIX_SIGNALS": "", + "#cmakedefine CATCH_CONFIG_NO_USE_ASYNC": "", + "#cmakedefine CATCH_CONFIG_NO_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT": "", + "#cmakedefine CATCH_CONFIG_NO_WCHAR": "", + "#cmakedefine CATCH_CONFIG_NO_WINDOWS_SEH": "", + "#cmakedefine 
CATCH_CONFIG_NOSTDOUT": "", + "#cmakedefine CATCH_CONFIG_POSIX_SIGNALS": "", + "#cmakedefine CATCH_CONFIG_PREFIX_ALL": "", + "#cmakedefine CATCH_CONFIG_PREFIX_MESSAGES": "", + "#cmakedefine CATCH_CONFIG_SHARED_LIBRARY": "", + "#cmakedefine CATCH_CONFIG_EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT": "", + "#cmakedefine CATCH_CONFIG_USE_ASYNC": "", + "#cmakedefine CATCH_CONFIG_WCHAR": "", + "#cmakedefine CATCH_CONFIG_WINDOWS_CRTDBG": "", + "#cmakedefine CATCH_CONFIG_WINDOWS_SEH": "", + }, + template = "src/catch2/catch_user_config.hpp.in", +) + +# Generated header library, modifies the include prefix to account for +# generation path so that we can include +# correctly. +cc_library( + name = "catch2_generated", + hdrs = ["catch2/catch_user_config.hpp"], + include_prefix = ".", # to manipulate -I of dependenices + visibility = ["//visibility:public"], +) + +# Static library, without main. +cc_library( + name = "catch2", + srcs = glob( + ["src/catch2/**/*.cpp"], + exclude = ["src/catch2/internal/catch_main.cpp"], + ), + hdrs = glob(["src/catch2/**/*.hpp"]), + includes = ["src/"], + linkstatic = True, + visibility = ["//visibility:public"], + deps = [":catch2_generated"], +) + +# Static library, with main. 
+cc_library( + name = "catch2_main", + srcs = ["src/catch2/internal/catch_main.cpp"], + includes = ["src/"], + linkstatic = True, + visibility = ["//visibility:public"], + deps = [":catch2"], +) diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMake/Catch2Config.cmake.in b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/Catch2Config.cmake.in new file mode 100644 index 00000000..c485219c --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/Catch2Config.cmake.in @@ -0,0 +1,10 @@ +@PACKAGE_INIT@ + + +# Avoid repeatedly including the targets +if(NOT TARGET Catch2::Catch2) + # Provide path for scripts + list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}") + + include(${CMAKE_CURRENT_LIST_DIR}/Catch2Targets.cmake) +endif() diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMake/CatchConfigOptions.cmake b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/CatchConfigOptions.cmake new file mode 100644 index 00000000..6eae220d --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/CatchConfigOptions.cmake @@ -0,0 +1,89 @@ + +# Copyright Catch2 Authors +# Distributed under the Boost Software License, Version 1.0. +# (See accompanying file LICENSE.txt or copy at +# https://www.boost.org/LICENSE_1_0.txt) + +# SPDX-License-Identifier: BSL-1.0 + +## +# This file contains options that are materialized into the Catch2 +# compiled library. All of them default to OFF, as even the positive +# forms correspond to the user _forcing_ them to ON, while being OFF +# means that Catch2 can use its own autodetection. 
+# +# For detailed docs look into docs/configuration.md + + +macro(AddOverridableConfigOption OptionBaseName) + option(CATCH_CONFIG_${OptionBaseName} "Read docs/configuration.md for details" OFF) + option(CATCH_CONFIG_NO_${OptionBaseName} "Read docs/configuration.md for details" OFF) + mark_as_advanced(CATCH_CONFIG_${OptionBaseName} CATCH_CONFIG_NO_${OptionBaseName}) +endmacro() + +macro(AddConfigOption OptionBaseName) + option(CATCH_CONFIG_${OptionBaseName} "Read docs/configuration.md for details" OFF) + mark_as_advanced(CATCH_CONFIG_${OptionBaseName}) +endmacro() + +set(_OverridableOptions + "ANDROID_LOGWRITE" + "BAZEL_SUPPORT" + "COLOUR_WIN32" + "COUNTER" + "CPP11_TO_STRING" + "CPP17_BYTE" + "CPP17_OPTIONAL" + "CPP17_STRING_VIEW" + "CPP17_UNCAUGHT_EXCEPTIONS" + "CPP17_VARIANT" + "GLOBAL_NEXTAFTER" + "POSIX_SIGNALS" + "USE_ASYNC" + "WCHAR" + "WINDOWS_SEH" + "GETENV" + "EXPERIMENTAL_STATIC_ANALYSIS_SUPPORT" +) + +foreach(OptionName ${_OverridableOptions}) + AddOverridableConfigOption(${OptionName}) +endforeach() + +set(_OtherConfigOptions + "DISABLE_EXCEPTIONS" + "DISABLE_EXCEPTIONS_CUSTOM_HANDLER" + "DISABLE" + "DISABLE_STRINGIFICATION" + "ENABLE_ALL_STRINGMAKERS" + "ENABLE_OPTIONAL_STRINGMAKER" + "ENABLE_PAIR_STRINGMAKER" + "ENABLE_TUPLE_STRINGMAKER" + "ENABLE_VARIANT_STRINGMAKER" + "EXPERIMENTAL_REDIRECT" + "FAST_COMPILE" + "NOSTDOUT" + "PREFIX_ALL" + "PREFIX_MESSAGES" + "WINDOWS_CRTDBG" +) + + +foreach(OptionName ${_OtherConfigOptions}) + AddConfigOption(${OptionName}) +endforeach() +if(DEFINED BUILD_SHARED_LIBS) + set(CATCH_CONFIG_SHARED_LIBRARY ${BUILD_SHARED_LIBS}) +else() + set(CATCH_CONFIG_SHARED_LIBRARY "") +endif() + +set(CATCH_CONFIG_DEFAULT_REPORTER "console" CACHE STRING "Read docs/configuration.md for details. The name of the reporter should be without quotes.") +set(CATCH_CONFIG_CONSOLE_WIDTH "80" CACHE STRING "Read docs/configuration.md for details. 
Must form a valid integer literal.") + +mark_as_advanced(CATCH_CONFIG_SHARED_LIBRARY CATCH_CONFIG_DEFAULT_REPORTER CATCH_CONFIG_CONSOLE_WIDTH) + +# There is no good way to both turn this into a CMake cache variable, +# and keep reasonable default semantics inside the project. Thus we do +# not define it and users have to provide it as an outside variable. +#set(CATCH_CONFIG_FALLBACK_STRINGIFIER "" CACHE STRING "Read docs/configuration.md for details.") diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMake/CatchMiscFunctions.cmake b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/CatchMiscFunctions.cmake new file mode 100644 index 00000000..84bd7cc7 --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/CatchMiscFunctions.cmake @@ -0,0 +1,121 @@ + +# Copyright Catch2 Authors +# Distributed under the Boost Software License, Version 1.0. +# (See accompanying file LICENSE.txt or copy at +# https://www.boost.org/LICENSE_1_0.txt) + +# SPDX-License-Identifier: BSL-1.0 + +include(CheckCXXCompilerFlag) +function(add_cxx_flag_if_supported_to_targets flagname targets) + string(MAKE_C_IDENTIFIER ${flagname} flag_identifier ) + check_cxx_compiler_flag("${flagname}" HAVE_FLAG_${flag_identifier}) + + if (HAVE_FLAG_${flag_identifier}) + foreach(target ${targets}) + target_compile_options(${target} PRIVATE ${flagname}) + endforeach() + endif() +endfunction() + +# Assumes that it is only called for development builds, where warnings +# and Werror is desired, so it also enables Werror. +function(add_warnings_to_targets targets) + LIST(LENGTH targets TARGETS_LEN) + # For now we just assume 2 possibilities: msvc and msvc-like compilers, + # and other. 
+ if (MSVC) + foreach(target ${targets}) + # Force MSVC to consider everything as encoded in utf-8 + target_compile_options( ${target} PRIVATE /utf-8 ) + # Enable Werror equivalent + if (CATCH_ENABLE_WERROR) + target_compile_options( ${target} PRIVATE /WX ) + endif() + + # MSVC is currently handled specially + if ( CMAKE_CXX_COMPILER_ID MATCHES "MSVC" ) + STRING(REGEX REPLACE "/W[0-9]" "/W4" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) # override default warning level + target_compile_options( ${target} PRIVATE /w44265 /w44061 /w44062 /w45038 ) + endif() + endforeach() + + endif() + + if (NOT MSVC) + set(CHECKED_WARNING_FLAGS + "-Wabsolute-value" + "-Wall" + "-Wcall-to-pure-virtual-from-ctor-dtor" + "-Wcast-align" + "-Wcatch-value" + "-Wdangling" + "-Wdeprecated" + "-Wdeprecated-register" + "-Wexceptions" + "-Wexit-time-destructors" + "-Wextra" + "-Wextra-semi" + "-Wfloat-equal" + "-Wglobal-constructors" + "-Winit-self" + "-Wmisleading-indentation" + "-Wmismatched-new-delete" + "-Wmismatched-return-types" + "-Wmismatched-tags" + "-Wmissing-braces" + "-Wmissing-declarations" + "-Wmissing-noreturn" + "-Wmissing-prototypes" + "-Wmissing-variable-declarations" + "-Wnull-dereference" + "-Wold-style-cast" + "-Woverloaded-virtual" + "-Wparentheses" + "-Wpedantic" + "-Wredundant-decls" + "-Wreorder" + "-Wreturn-std-move" + "-Wshadow" + "-Wstrict-aliasing" + "-Wsubobject-linkage" + "-Wsuggest-destructor-override" + "-Wsuggest-override" + "-Wundef" + "-Wuninitialized" + "-Wunneeded-internal-declaration" + "-Wunreachable-code-aggressive" + "-Wunused" + "-Wunused-function" + "-Wunused-parameter" + "-Wvla" + "-Wweak-vtables" + + # This is a useful warning, but our tests sometimes rely on + # functions being present, but not picked (e.g. various checks + # for stringification implementation ordering). + # Ergo, we should use it every now and then, but we cannot + # enable it by default. 
+ # "-Wunused-member-function" + ) + foreach(warning ${CHECKED_WARNING_FLAGS}) + add_cxx_flag_if_supported_to_targets(${warning} "${targets}") + endforeach() + + if (CATCH_ENABLE_WERROR) + foreach(target ${targets}) + # Enable Werror equivalent + target_compile_options( ${target} PRIVATE -Werror ) + endforeach() + endif() + endif() +endfunction() + +# Adds flags required for reproducible build to the target +# Currently only supports GCC and Clang +function(add_build_reproducibility_settings target) + # Make the build reproducible on versions of g++ and clang that supports -ffile-prefix-map + if((CMAKE_CXX_COMPILER_ID STREQUAL "GNU") OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang")) + add_cxx_flag_if_supported_to_targets("-ffile-prefix-map=${CATCH_DIR}/=" "${target}") + endif() +endfunction() diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMake/FindGcov.cmake b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/FindGcov.cmake new file mode 100644 index 00000000..41417113 --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/FindGcov.cmake @@ -0,0 +1,157 @@ +# This file is part of CMake-codecov. +# +# Copyright (c) +# 2015-2017 RWTH Aachen University, Federal Republic of Germany +# +# See the LICENSE file in the package base directory for details +# +# Written by Alexander Haase, alexander.haase@rwth-aachen.de +# + + +# include required Modules +include(FindPackageHandleStandardArgs) + + +# Search for gcov binary. +set(CMAKE_REQUIRED_QUIET_SAVE ${CMAKE_REQUIRED_QUIET}) +set(CMAKE_REQUIRED_QUIET ${codecov_FIND_QUIETLY}) + +get_property(ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) +foreach (LANG ${ENABLED_LANGUAGES}) + # Gcov evaluation is dependent on the used compiler. Check gcov support for + # each compiler that is used. If gcov binary was already found for this + # compiler, do not try to find it again. 
+ if (NOT GCOV_${CMAKE_${LANG}_COMPILER_ID}_BIN) + get_filename_component(COMPILER_PATH "${CMAKE_${LANG}_COMPILER}" PATH) + + if ("${CMAKE_${LANG}_COMPILER_ID}" STREQUAL "GNU") + # Some distributions like OSX (homebrew) ship gcov with the compiler + # version appended as gcov-x. To find this binary we'll build the + # suggested binary name with the compiler version. + string(REGEX MATCH "^[0-9]+" GCC_VERSION + "${CMAKE_${LANG}_COMPILER_VERSION}") + + find_program(GCOV_BIN NAMES gcov-${GCC_VERSION} gcov + HINTS ${COMPILER_PATH}) + + elseif ("${CMAKE_${LANG}_COMPILER_ID}" STREQUAL "Clang") + # Some distributions like Debian ship llvm-cov with the compiler + # version appended as llvm-cov-x.y. To find this binary we'll build + # the suggested binary name with the compiler version. + string(REGEX MATCH "^[0-9]+.[0-9]+" LLVM_VERSION + "${CMAKE_${LANG}_COMPILER_VERSION}") + + # llvm-cov prior version 3.5 seems to be not working with coverage + # evaluation tools, but these versions are compatible with the gcc + # gcov tool. + if(LLVM_VERSION VERSION_GREATER 3.4) + find_program(LLVM_COV_BIN NAMES "llvm-cov-${LLVM_VERSION}" + "llvm-cov" HINTS ${COMPILER_PATH}) + mark_as_advanced(LLVM_COV_BIN) + + if (LLVM_COV_BIN) + find_program(LLVM_COV_WRAPPER "llvm-cov-wrapper" PATHS + ${CMAKE_MODULE_PATH}) + if (LLVM_COV_WRAPPER) + set(GCOV_BIN "${LLVM_COV_WRAPPER}" CACHE FILEPATH "") + + # set additional parameters + set(GCOV_${CMAKE_${LANG}_COMPILER_ID}_ENV + "LLVM_COV_BIN=${LLVM_COV_BIN}" CACHE STRING + "Environment variables for llvm-cov-wrapper.") + mark_as_advanced(GCOV_${CMAKE_${LANG}_COMPILER_ID}_ENV) + endif () + endif () + endif () + + if (NOT GCOV_BIN) + # Fall back to gcov binary if llvm-cov was not found or is + # incompatible. This is the default on OSX, but may crash on + # recent Linux versions. 
+ find_program(GCOV_BIN gcov HINTS ${COMPILER_PATH}) + endif () + endif () + + + if (GCOV_BIN) + set(GCOV_${CMAKE_${LANG}_COMPILER_ID}_BIN "${GCOV_BIN}" CACHE STRING + "${LANG} gcov binary.") + + if (NOT CMAKE_REQUIRED_QUIET) + message("-- Found gcov evaluation for " + "${CMAKE_${LANG}_COMPILER_ID}: ${GCOV_BIN}") + endif() + + unset(GCOV_BIN CACHE) + endif () + endif () +endforeach () + + + + +# Add a new global target for all gcov targets. This target could be used to +# generate the gcov files for the whole project instead of calling -gcov +# for each target. +if (NOT TARGET gcov) + add_custom_target(gcov) +endif (NOT TARGET gcov) + + + +# This function will add gcov evaluation for target . Only sources of +# this target will be evaluated and no dependencies will be added. It will call +# Gcov on any source file of once and store the gcov file in the same +# directory. +function (add_gcov_target TNAME) + set(TDIR ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${TNAME}.dir) + + # We don't have to check, if the target has support for coverage, thus this + # will be checked by add_coverage_target in Findcoverage.cmake. Instead we + # have to determine which gcov binary to use. + get_target_property(TSOURCES ${TNAME} SOURCES) + set(SOURCES "") + set(TCOMPILER "") + foreach (FILE ${TSOURCES}) + codecov_path_of_source(${FILE} FILE) + if (NOT "${FILE}" STREQUAL "") + codecov_lang_of_source(${FILE} LANG) + if (NOT "${LANG}" STREQUAL "") + list(APPEND SOURCES "${FILE}") + set(TCOMPILER ${CMAKE_${LANG}_COMPILER_ID}) + endif () + endif () + endforeach () + + # If no gcov binary was found, coverage data can't be evaluated. 
+ if (NOT GCOV_${TCOMPILER}_BIN) + message(WARNING "No coverage evaluation binary found for ${TCOMPILER}.") + return() + endif () + + set(GCOV_BIN "${GCOV_${TCOMPILER}_BIN}") + set(GCOV_ENV "${GCOV_${TCOMPILER}_ENV}") + + + set(BUFFER "") + foreach(FILE ${SOURCES}) + get_filename_component(FILE_PATH "${TDIR}/${FILE}" PATH) + + # call gcov + add_custom_command(OUTPUT ${TDIR}/${FILE}.gcov + COMMAND ${GCOV_ENV} ${GCOV_BIN} ${TDIR}/${FILE}.gcno > /dev/null + DEPENDS ${TNAME} ${TDIR}/${FILE}.gcno + WORKING_DIRECTORY ${FILE_PATH} + ) + + list(APPEND BUFFER ${TDIR}/${FILE}.gcov) + endforeach() + + + # add target for gcov evaluation of + add_custom_target(${TNAME}-gcov DEPENDS ${BUFFER}) + + # add evaluation target to the global gcov target. + add_dependencies(gcov ${TNAME}-gcov) +endfunction (add_gcov_target) diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMake/FindLcov.cmake b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/FindLcov.cmake new file mode 100644 index 00000000..beb925ae --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/FindLcov.cmake @@ -0,0 +1,354 @@ +# This file is part of CMake-codecov. +# +# Copyright (c) +# 2015-2017 RWTH Aachen University, Federal Republic of Germany +# +# See the LICENSE file in the package base directory for details +# +# Written by Alexander Haase, alexander.haase@rwth-aachen.de +# + + +# configuration +set(LCOV_DATA_PATH "${CMAKE_BINARY_DIR}/lcov/data") +set(LCOV_DATA_PATH_INIT "${LCOV_DATA_PATH}/init") +set(LCOV_DATA_PATH_CAPTURE "${LCOV_DATA_PATH}/capture") +set(LCOV_HTML_PATH "${CMAKE_BINARY_DIR}/lcov/html") + + + + +# Search for Gcov which is used by Lcov. +find_package(Gcov) + + + + +# This function will add lcov evaluation for target . Only sources of +# this target will be evaluated and no dependencies will be added. It will call +# geninfo on any source file of once and store the info file in the same +# directory. 
+# +# Note: This function is only a wrapper to define this function always, even if +# coverage is not supported by the compiler or disabled. This function must +# be defined here, because the module will be exited, if there is no coverage +# support by the compiler or it is disabled by the user. +function (add_lcov_target TNAME) + if (LCOV_FOUND) + # capture initial coverage data + lcov_capture_initial_tgt(${TNAME}) + + # capture coverage data after execution + lcov_capture_tgt(${TNAME}) + endif () +endfunction (add_lcov_target) + + + + +# include required Modules +include(FindPackageHandleStandardArgs) + +# Search for required lcov binaries. +find_program(LCOV_BIN lcov) +find_program(GENINFO_BIN geninfo) +find_program(GENHTML_BIN genhtml) +find_package_handle_standard_args(lcov + REQUIRED_VARS LCOV_BIN GENINFO_BIN GENHTML_BIN +) + +# enable genhtml C++ demangeling, if c++filt is found. +set(GENHTML_CPPFILT_FLAG "") +find_program(CPPFILT_BIN c++filt) +if (NOT CPPFILT_BIN STREQUAL "") + set(GENHTML_CPPFILT_FLAG "--demangle-cpp") +endif (NOT CPPFILT_BIN STREQUAL "") + +# enable no-external flag for lcov, if available. +if (GENINFO_BIN AND NOT DEFINED GENINFO_EXTERN_FLAG) + set(FLAG "") + execute_process(COMMAND ${GENINFO_BIN} --help OUTPUT_VARIABLE GENINFO_HELP) + string(REGEX MATCH "external" GENINFO_RES "${GENINFO_HELP}") + if (GENINFO_RES) + set(FLAG "--no-external") + endif () + + set(GENINFO_EXTERN_FLAG "${FLAG}" + CACHE STRING "Geninfo flag to exclude system sources.") +endif () + +# If Lcov was not found, exit module now. +if (NOT LCOV_FOUND) + return() +endif (NOT LCOV_FOUND) + + + + +# Create directories to be used. +file(MAKE_DIRECTORY ${LCOV_DATA_PATH_INIT}) +file(MAKE_DIRECTORY ${LCOV_DATA_PATH_CAPTURE}) + +set(LCOV_REMOVE_PATTERNS "") + +# This function will merge lcov files to a single target file. Additional lcov +# flags may be set with setting LCOV_EXTRA_FLAGS before calling this function. +function (lcov_merge_files OUTFILE ...) 
+ # Remove ${OUTFILE} from ${ARGV} and generate lcov parameters with files. + list(REMOVE_AT ARGV 0) + + # Generate merged file. + string(REPLACE "${CMAKE_BINARY_DIR}/" "" FILE_REL "${OUTFILE}") + add_custom_command(OUTPUT "${OUTFILE}.raw" + COMMAND cat ${ARGV} > ${OUTFILE}.raw + DEPENDS ${ARGV} + COMMENT "Generating ${FILE_REL}" + ) + + add_custom_command(OUTPUT "${OUTFILE}" + COMMAND ${LCOV_BIN} --quiet -a ${OUTFILE}.raw --output-file ${OUTFILE} + --base-directory ${PROJECT_SOURCE_DIR} ${LCOV_EXTRA_FLAGS} + COMMAND ${LCOV_BIN} --quiet -r ${OUTFILE} ${LCOV_REMOVE_PATTERNS} + --output-file ${OUTFILE} ${LCOV_EXTRA_FLAGS} + DEPENDS ${OUTFILE}.raw + COMMENT "Post-processing ${FILE_REL}" + ) +endfunction () + + + + +# Add a new global target to generate initial coverage reports for all targets. +# This target will be used to generate the global initial info file, which is +# used to gather even empty report data. +if (NOT TARGET lcov-capture-init) + add_custom_target(lcov-capture-init) + set(LCOV_CAPTURE_INIT_FILES "" CACHE INTERNAL "") +endif (NOT TARGET lcov-capture-init) + + +# This function will add initial capture of coverage data for target , +# which is needed to get also data for objects, which were not loaded at +# execution time. It will call geninfo for every source file of once and +# store the info file in the same directory. +function (lcov_capture_initial_tgt TNAME) + # We don't have to check, if the target has support for coverage, thus this + # will be checked by add_coverage_target in Findcoverage.cmake. Instead we + # have to determine which gcov binary to use. 
+ get_target_property(TSOURCES ${TNAME} SOURCES) + set(SOURCES "") + set(TCOMPILER "") + foreach (FILE ${TSOURCES}) + codecov_path_of_source(${FILE} FILE) + if (NOT "${FILE}" STREQUAL "") + codecov_lang_of_source(${FILE} LANG) + if (NOT "${LANG}" STREQUAL "") + list(APPEND SOURCES "${FILE}") + set(TCOMPILER ${CMAKE_${LANG}_COMPILER_ID}) + endif () + endif () + endforeach () + + # If no gcov binary was found, coverage data can't be evaluated. + if (NOT GCOV_${TCOMPILER}_BIN) + message(WARNING "No coverage evaluation binary found for ${TCOMPILER}.") + return() + endif () + + set(GCOV_BIN "${GCOV_${TCOMPILER}_BIN}") + set(GCOV_ENV "${GCOV_${TCOMPILER}_ENV}") + + + set(TDIR ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${TNAME}.dir) + set(GENINFO_FILES "") + foreach(FILE ${SOURCES}) + # generate empty coverage files + set(OUTFILE "${TDIR}/${FILE}.info.init") + list(APPEND GENINFO_FILES ${OUTFILE}) + + add_custom_command(OUTPUT ${OUTFILE} COMMAND ${GCOV_ENV} ${GENINFO_BIN} + --quiet --base-directory ${PROJECT_SOURCE_DIR} --initial + --gcov-tool ${GCOV_BIN} --output-filename ${OUTFILE} + ${GENINFO_EXTERN_FLAG} ${TDIR}/${FILE}.gcno + DEPENDS ${TNAME} + COMMENT "Capturing initial coverage data for ${FILE}" + ) + endforeach() + + # Concatenate all files generated by geninfo to a single file per target. + set(OUTFILE "${LCOV_DATA_PATH_INIT}/${TNAME}.info") + set(LCOV_EXTRA_FLAGS "--initial") + lcov_merge_files("${OUTFILE}" ${GENINFO_FILES}) + add_custom_target(${TNAME}-capture-init ALL DEPENDS ${OUTFILE}) + + # add geninfo file generation to global lcov-geninfo target + add_dependencies(lcov-capture-init ${TNAME}-capture-init) + set(LCOV_CAPTURE_INIT_FILES "${LCOV_CAPTURE_INIT_FILES}" + "${OUTFILE}" CACHE INTERNAL "" + ) +endfunction (lcov_capture_initial_tgt) + + +# This function will generate the global info file for all targets. 
It has to be +# called after all other CMake functions in the root CMakeLists.txt file, to get +# a full list of all targets that generate coverage data. +function (lcov_capture_initial) + # Skip this function (and do not create the following targets), if there are + # no input files. + if ("${LCOV_CAPTURE_INIT_FILES}" STREQUAL "") + return() + endif () + + # Add a new target to merge the files of all targets. + set(OUTFILE "${LCOV_DATA_PATH_INIT}/all_targets.info") + lcov_merge_files("${OUTFILE}" ${LCOV_CAPTURE_INIT_FILES}) + add_custom_target(lcov-geninfo-init ALL DEPENDS ${OUTFILE} + lcov-capture-init + ) +endfunction (lcov_capture_initial) + + + + +# Add a new global target to generate coverage reports for all targets. This +# target will be used to generate the global info file. +if (NOT TARGET lcov-capture) + add_custom_target(lcov-capture) + set(LCOV_CAPTURE_FILES "" CACHE INTERNAL "") +endif (NOT TARGET lcov-capture) + + +# This function will add capture of coverage data for target , which is +# needed to get also data for objects, which were not loaded at execution time. +# It will call geninfo for every source file of once and store the info +# file in the same directory. +function (lcov_capture_tgt TNAME) + # We don't have to check, if the target has support for coverage, thus this + # will be checked by add_coverage_target in Findcoverage.cmake. Instead we + # have to determine which gcov binary to use. + get_target_property(TSOURCES ${TNAME} SOURCES) + set(SOURCES "") + set(TCOMPILER "") + foreach (FILE ${TSOURCES}) + codecov_path_of_source(${FILE} FILE) + if (NOT "${FILE}" STREQUAL "") + codecov_lang_of_source(${FILE} LANG) + if (NOT "${LANG}" STREQUAL "") + list(APPEND SOURCES "${FILE}") + set(TCOMPILER ${CMAKE_${LANG}_COMPILER_ID}) + endif () + endif () + endforeach () + + # If no gcov binary was found, coverage data can't be evaluated. 
+ if (NOT GCOV_${TCOMPILER}_BIN) + message(WARNING "No coverage evaluation binary found for ${TCOMPILER}.") + return() + endif () + + set(GCOV_BIN "${GCOV_${TCOMPILER}_BIN}") + set(GCOV_ENV "${GCOV_${TCOMPILER}_ENV}") + + + set(TDIR ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${TNAME}.dir) + set(GENINFO_FILES "") + foreach(FILE ${SOURCES}) + # Generate coverage files. If no .gcda file was generated during + # execution, the empty coverage file will be used instead. + set(OUTFILE "${TDIR}/${FILE}.info") + list(APPEND GENINFO_FILES ${OUTFILE}) + + add_custom_command(OUTPUT ${OUTFILE} + COMMAND test -f "${TDIR}/${FILE}.gcda" + && ${GCOV_ENV} ${GENINFO_BIN} --quiet --base-directory + ${PROJECT_SOURCE_DIR} --gcov-tool ${GCOV_BIN} + --output-filename ${OUTFILE} ${GENINFO_EXTERN_FLAG} + ${TDIR}/${FILE}.gcda + || cp ${OUTFILE}.init ${OUTFILE} + DEPENDS ${TNAME} ${TNAME}-capture-init + COMMENT "Capturing coverage data for ${FILE}" + ) + endforeach() + + # Concatenate all files generated by geninfo to a single file per target. + set(OUTFILE "${LCOV_DATA_PATH_CAPTURE}/${TNAME}.info") + lcov_merge_files("${OUTFILE}" ${GENINFO_FILES}) + add_custom_target(${TNAME}-geninfo DEPENDS ${OUTFILE}) + + # add geninfo file generation to global lcov-capture target + add_dependencies(lcov-capture ${TNAME}-geninfo) + set(LCOV_CAPTURE_FILES "${LCOV_CAPTURE_FILES}" "${OUTFILE}" CACHE INTERNAL + "" + ) + + # Add target for generating html output for this target only. + file(MAKE_DIRECTORY ${LCOV_HTML_PATH}/${TNAME}) + add_custom_target(${TNAME}-genhtml + COMMAND ${GENHTML_BIN} --quiet --sort --prefix ${PROJECT_SOURCE_DIR} + --baseline-file ${LCOV_DATA_PATH_INIT}/${TNAME}.info + --output-directory ${LCOV_HTML_PATH}/${TNAME} + --title "${CMAKE_PROJECT_NAME} - target ${TNAME}" + ${GENHTML_CPPFILT_FLAG} ${OUTFILE} + DEPENDS ${TNAME}-geninfo ${TNAME}-capture-init + ) +endfunction (lcov_capture_tgt) + + +# This function will generate the global info file for all targets. 
It has to be +# called after all other CMake functions in the root CMakeLists.txt file, to get +# a full list of all targets that generate coverage data. +function (lcov_capture) + # Skip this function (and do not create the following targets), if there are + # no input files. + if ("${LCOV_CAPTURE_FILES}" STREQUAL "") + return() + endif () + + # Add a new target to merge the files of all targets. + set(OUTFILE "${LCOV_DATA_PATH_CAPTURE}/all_targets.info") + lcov_merge_files("${OUTFILE}" ${LCOV_CAPTURE_FILES}) + add_custom_target(lcov-geninfo DEPENDS ${OUTFILE} lcov-capture) + + # Add a new global target for all lcov targets. This target could be used to + # generate the lcov html output for the whole project instead of calling + # -geninfo and -genhtml for each target. It will also be + # used to generate a html site for all project data together instead of one + # for each target. + if (NOT TARGET lcov) + file(MAKE_DIRECTORY ${LCOV_HTML_PATH}/all_targets) + add_custom_target(lcov + COMMAND ${GENHTML_BIN} --quiet --sort + --baseline-file ${LCOV_DATA_PATH_INIT}/all_targets.info + --output-directory ${LCOV_HTML_PATH}/all_targets + --title "${CMAKE_PROJECT_NAME}" --prefix "${PROJECT_SOURCE_DIR}" + ${GENHTML_CPPFILT_FLAG} ${OUTFILE} + DEPENDS lcov-geninfo-init lcov-geninfo + ) + endif () +endfunction (lcov_capture) + + + + +# Add a new global target to generate the lcov html report for the whole project +# instead of calling -genhtml for each target (to create an own report +# for each target). Instead of the lcov target it does not require geninfo for +# all targets, so you have to call -geninfo to generate the info files +# the targets you'd like to have in your report or lcov-geninfo for generating +# info files for all targets before calling lcov-genhtml. 
+file(MAKE_DIRECTORY ${LCOV_HTML_PATH}/selected_targets) +if (NOT TARGET lcov-genhtml) + add_custom_target(lcov-genhtml + COMMAND ${GENHTML_BIN} + --quiet + --output-directory ${LCOV_HTML_PATH}/selected_targets + --title \"${CMAKE_PROJECT_NAME} - targets `find + ${LCOV_DATA_PATH_CAPTURE} -name \"*.info\" ! -name + \"all_targets.info\" -exec basename {} .info \\\;`\" + --prefix ${PROJECT_SOURCE_DIR} + --sort + ${GENHTML_CPPFILT_FLAG} + `find ${LCOV_DATA_PATH_CAPTURE} -name \"*.info\" ! -name + \"all_targets.info\"` + ) +endif (NOT TARGET lcov-genhtml) diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMake/Findcodecov.cmake b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/Findcodecov.cmake new file mode 100644 index 00000000..2c0f2fee --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/Findcodecov.cmake @@ -0,0 +1,258 @@ +# This file is part of CMake-codecov. +# +# Copyright (c) +# 2015-2017 RWTH Aachen University, Federal Republic of Germany +# +# See the LICENSE file in the package base directory for details +# +# Written by Alexander Haase, alexander.haase@rwth-aachen.de +# + + +# Add an option to choose, if coverage should be enabled or not. If enabled +# marked targets will be build with coverage support and appropriate targets +# will be added. If disabled coverage will be ignored for *ALL* targets. +option(ENABLE_COVERAGE "Enable coverage build." OFF) + +set(COVERAGE_FLAG_CANDIDATES + # gcc and clang + "-O0 -g -fprofile-arcs -ftest-coverage" + + # gcc and clang fallback + "-O0 -g --coverage" +) + + +# Add coverage support for target ${TNAME} and register target for coverage +# evaluation. If coverage is disabled or not supported, this function will +# simply do nothing. +# +# Note: This function is only a wrapper to define this function always, even if +# coverage is not supported by the compiler or disabled. 
This function must +# be defined here, because the module will be exited, if there is no coverage +# support by the compiler or it is disabled by the user. +function (add_coverage TNAME) + # only add coverage for target, if coverage is supported and enabled. + if (ENABLE_COVERAGE) + foreach (TNAME ${ARGV}) + add_coverage_target(${TNAME}) + endforeach () + endif () +endfunction (add_coverage) + + +# Add global target to gather coverage information after all targets have been +# added. Other evaluation functions could be added here, after checks for the +# specific module have been passed. +# +# Note: This function is only a wrapper to define this function always, even if +# coverage is not supported by the compiler or disabled. This function must +# be defined here, because the module will be exited, if there is no coverage +# support by the compiler or it is disabled by the user. +function (coverage_evaluate) + # add lcov evaluation + if (LCOV_FOUND) + lcov_capture_initial() + lcov_capture() + endif (LCOV_FOUND) +endfunction () + + +# Exit this module, if coverage is disabled. add_coverage is defined before this +# return, so this module can be exited now safely without breaking any build- +# scripts. +if (NOT ENABLE_COVERAGE) + return() +endif () + + + + +# Find the required flags for each language. +set(CMAKE_REQUIRED_QUIET_SAVE ${CMAKE_REQUIRED_QUIET}) +set(CMAKE_REQUIRED_QUIET ${codecov_FIND_QUIETLY}) + +get_property(ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) +foreach (LANG ${ENABLED_LANGUAGES}) + # Coverage flags are not dependent on language, but the used compiler. So + # instead of searching flags for each language, search flags for each compiler + # used.
+ set(COMPILER ${CMAKE_${LANG}_COMPILER_ID}) + if (NOT COVERAGE_${COMPILER}_FLAGS) + foreach (FLAG ${COVERAGE_FLAG_CANDIDATES}) + if(NOT CMAKE_REQUIRED_QUIET) + message(STATUS "Try ${COMPILER} code coverage flag = [${FLAG}]") + endif() + + set(CMAKE_REQUIRED_FLAGS "${FLAG}") + unset(COVERAGE_FLAG_DETECTED CACHE) + + if (${LANG} STREQUAL "C") + include(CheckCCompilerFlag) + check_c_compiler_flag("${FLAG}" COVERAGE_FLAG_DETECTED) + + elseif (${LANG} STREQUAL "CXX") + include(CheckCXXCompilerFlag) + check_cxx_compiler_flag("${FLAG}" COVERAGE_FLAG_DETECTED) + + elseif (${LANG} STREQUAL "Fortran") + # CheckFortranCompilerFlag was introduced in CMake 3.x. To be + # compatible with older Cmake versions, we will check if this + # module is present before we use it. Otherwise we will define + # Fortran coverage support as not available. + include(CheckFortranCompilerFlag OPTIONAL + RESULT_VARIABLE INCLUDED) + if (INCLUDED) + check_fortran_compiler_flag("${FLAG}" + COVERAGE_FLAG_DETECTED) + elseif (NOT CMAKE_REQUIRED_QUIET) + message("-- Performing Test COVERAGE_FLAG_DETECTED") + message("-- Performing Test COVERAGE_FLAG_DETECTED - Failed" + " (Check not supported)") + endif () + endif() + + if (COVERAGE_FLAG_DETECTED) + set(COVERAGE_${COMPILER}_FLAGS "${FLAG}" + CACHE STRING "${COMPILER} flags for code coverage.") + mark_as_advanced(COVERAGE_${COMPILER}_FLAGS) + break() + else () + message(WARNING "Code coverage is not available for ${COMPILER}" + " compiler. Targets using this compiler will be " + "compiled without it.") + endif () + endforeach () + endif () +endforeach () + +set(CMAKE_REQUIRED_QUIET ${CMAKE_REQUIRED_QUIET_SAVE}) + + + + +# Helper function to get the language of a source file. 
+function (codecov_lang_of_source FILE RETURN_VAR) + get_filename_component(FILE_EXT "${FILE}" EXT) + string(TOLOWER "${FILE_EXT}" FILE_EXT) + string(SUBSTRING "${FILE_EXT}" 1 -1 FILE_EXT) + + get_property(ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) + foreach (LANG ${ENABLED_LANGUAGES}) + list(FIND CMAKE_${LANG}_SOURCE_FILE_EXTENSIONS "${FILE_EXT}" TEMP) + if (NOT ${TEMP} EQUAL -1) + set(${RETURN_VAR} "${LANG}" PARENT_SCOPE) + return() + endif () + endforeach() + + set(${RETURN_VAR} "" PARENT_SCOPE) +endfunction () + + +# Helper function to get the relative path of the source file destination path. +# This path is needed by FindGcov and FindLcov cmake files to locate the +# captured data. +function (codecov_path_of_source FILE RETURN_VAR) + string(REGEX MATCH "TARGET_OBJECTS:([^ >]+)" _source ${FILE}) + + # If expression was found, SOURCEFILE is a generator-expression for an + # object library. Currently we found no way to call this function automatically + # for the referenced target, so it must be called in the directory of the + # object library definition. + if (NOT "${_source}" STREQUAL "") + set(${RETURN_VAR} "" PARENT_SCOPE) + return() + endif () + + + string(REPLACE "${CMAKE_CURRENT_BINARY_DIR}/" "" FILE "${FILE}") + if(IS_ABSOLUTE ${FILE}) + file(RELATIVE_PATH FILE ${CMAKE_CURRENT_SOURCE_DIR} ${FILE}) + endif() + + # get the right path for file + string(REPLACE ".." "__" PATH "${FILE}") + + set(${RETURN_VAR} "${PATH}" PARENT_SCOPE) +endfunction() + + + + +# Add coverage support for target ${TNAME} and register target for coverage +# evaluation. +function(add_coverage_target TNAME) + # Check if all sources for target use the same compiler. If a target uses + # e.g. C and Fortran mixed and uses different compilers (e.g. clang and + # gfortran) this can trigger huge problems, because different compilers may + # use different implementations for code coverage.
+ get_target_property(TSOURCES ${TNAME} SOURCES) + set(TARGET_COMPILER "") + set(ADDITIONAL_FILES "") + foreach (FILE ${TSOURCES}) + # If expression was found, FILE is a generator-expression for an object + # library. Object libraries will be ignored. + string(REGEX MATCH "TARGET_OBJECTS:([^ >]+)" _file ${FILE}) + if ("${_file}" STREQUAL "") + codecov_lang_of_source(${FILE} LANG) + if (LANG) + list(APPEND TARGET_COMPILER ${CMAKE_${LANG}_COMPILER_ID}) + + list(APPEND ADDITIONAL_FILES "${FILE}.gcno") + list(APPEND ADDITIONAL_FILES "${FILE}.gcda") + endif () + endif () + endforeach () + + list(REMOVE_DUPLICATES TARGET_COMPILER) + list(LENGTH TARGET_COMPILER NUM_COMPILERS) + + if (NUM_COMPILERS GREATER 1) + message(WARNING "Can't use code coverage for target ${TNAME}, because " + "it will be compiled by incompatible compilers. Target will be " + "compiled without code coverage.") + return() + + elseif (NUM_COMPILERS EQUAL 0) + message(WARNING "Can't use code coverage for target ${TNAME}, because " + "it uses an unknown compiler. Target will be compiled without " + "code coverage.") + return() + + elseif (NOT DEFINED "COVERAGE_${TARGET_COMPILER}_FLAGS") + # A warning has been printed before, so just return if flags for this + # compiler aren't available. + return() + endif() + + + # enable coverage for target + set_property(TARGET ${TNAME} APPEND_STRING + PROPERTY COMPILE_FLAGS " ${COVERAGE_${TARGET_COMPILER}_FLAGS}") + set_property(TARGET ${TNAME} APPEND_STRING + PROPERTY LINK_FLAGS " ${COVERAGE_${TARGET_COMPILER}_FLAGS}") + + + # Add gcov files generated by compiler to clean target. 
+ set(CLEAN_FILES "") + foreach (FILE ${ADDITIONAL_FILES}) + codecov_path_of_source(${FILE} FILE) + list(APPEND CLEAN_FILES "CMakeFiles/${TNAME}.dir/${FILE}") + endforeach() + + set_directory_properties(PROPERTIES ADDITIONAL_MAKE_CLEAN_FILES + "${CLEAN_FILES}") + + + add_gcov_target(${TNAME}) + add_lcov_target(${TNAME}) +endfunction(add_coverage_target) + + + + +# Include modules for parsing the collected data and output it in a readable +# format (like gcov and lcov). +find_package(Gcov) +find_package(Lcov) diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMake/catch2-with-main.pc.in b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/catch2-with-main.pc.in new file mode 100644 index 00000000..69a790bb --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/catch2-with-main.pc.in @@ -0,0 +1,10 @@ +includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ +libdir=@CMAKE_INSTALL_FULL_LIBDIR@ +pkg_version=@Catch2_VERSION@ + +Name: Catch2-With-Main +Description: A modern, C++-native test framework for C++14 and above (links in default main) +Version: ${pkg_version} +Requires: catch2 = ${pkg_version} +Cflags: -I${includedir} +Libs: -L${libdir} -lCatch2Main diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMake/catch2.pc.in b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/catch2.pc.in new file mode 100644 index 00000000..bd1c95a1 --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/catch2.pc.in @@ -0,0 +1,11 @@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=${prefix} +includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@ +libdir=@CMAKE_INSTALL_FULL_LIBDIR@ + +Name: Catch2 +Description: A modern, C++-native, test framework for C++14 and above +URL: https://github.com/catchorg/Catch2 +Version: @Catch2_VERSION@ +Cflags: -I${includedir} +Libs: -L${libdir} -lCatch2 diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMake/llvm-cov-wrapper b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/llvm-cov-wrapper new file mode 100755 index 
00000000..2ac33102 --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMake/llvm-cov-wrapper @@ -0,0 +1,56 @@ +#!/bin/sh + +# This file is part of CMake-codecov. +# +# Copyright (c) +# 2015-2017 RWTH Aachen University, Federal Republic of Germany +# +# See the LICENSE file in the package base directory for details +# +# Written by Alexander Haase, alexander.haase@rwth-aachen.de +# + +if [ -z "$LLVM_COV_BIN" ] +then + echo "LLVM_COV_BIN not set!" >& 2 + exit 1 +fi + + +# Get LLVM version to find out. +LLVM_VERSION=$($LLVM_COV_BIN -version | grep -i "LLVM version" \ + | sed "s/^\([A-Za-z ]*\)\([0-9]\).\([0-9]\).*$/\2.\3/g") + +if [ "$1" = "-v" ] +then + echo "llvm-cov-wrapper $LLVM_VERSION" + exit 0 +fi + + +if [ -n "$LLVM_VERSION" ] +then + MAJOR=$(echo $LLVM_VERSION | cut -d'.' -f1) + MINOR=$(echo $LLVM_VERSION | cut -d'.' -f2) + + if [ $MAJOR -eq 3 ] && [ $MINOR -le 4 ] + then + if [ -f "$1" ] + then + filename=$(basename "$1") + extension="${filename##*.}" + + case "$extension" in + "gcno") exec $LLVM_COV_BIN --gcno="$1" ;; + "gcda") exec $LLVM_COV_BIN --gcda="$1" ;; + esac + fi + fi + + if [ $MAJOR -eq 3 ] && [ $MINOR -le 5 ] + then + exec $LLVM_COV_BIN $@ + fi +fi + +exec $LLVM_COV_BIN gcov $@ diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMakeLists.txt b/ethosu/regor/dependencies/thirdparty/Catch2/CMakeLists.txt new file mode 100644 index 00000000..ad5d939b --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMakeLists.txt @@ -0,0 +1,203 @@ +cmake_minimum_required(VERSION 3.10) + +# detect if Catch is being bundled, +# disable testsuite in that case +if(NOT DEFINED PROJECT_NAME) + set(NOT_SUBPROJECT ON) +else() + set(NOT_SUBPROJECT OFF) +endif() + +option(CATCH_INSTALL_DOCS "Install documentation alongside library" ON) +option(CATCH_INSTALL_EXTRAS "Install extras (CMake scripts, debugger helpers) alongside library" ON) +option(CATCH_DEVELOPMENT_BUILD "Build tests, enable warnings, enable Werror, etc" OFF) 
+option(CATCH_ENABLE_REPRODUCIBLE_BUILD "Add compiler flags for improving build reproducibility" ON) + +include(CMakeDependentOption) +cmake_dependent_option(CATCH_BUILD_TESTING "Build the SelfTest project" ON "CATCH_DEVELOPMENT_BUILD" OFF) +cmake_dependent_option(CATCH_BUILD_EXAMPLES "Build code examples" OFF "CATCH_DEVELOPMENT_BUILD" OFF) +cmake_dependent_option(CATCH_BUILD_EXTRA_TESTS "Build extra tests" OFF "CATCH_DEVELOPMENT_BUILD" OFF) +cmake_dependent_option(CATCH_BUILD_FUZZERS "Build fuzzers" OFF "CATCH_DEVELOPMENT_BUILD" OFF) +cmake_dependent_option(CATCH_ENABLE_COVERAGE "Generate coverage for codecov.io" OFF "CATCH_DEVELOPMENT_BUILD" OFF) +cmake_dependent_option(CATCH_ENABLE_WERROR "Enables Werror during build" ON "CATCH_DEVELOPMENT_BUILD" OFF) +cmake_dependent_option(CATCH_BUILD_SURROGATES "Enable generating and building surrogate TUs for the main headers" OFF "CATCH_DEVELOPMENT_BUILD" OFF) +cmake_dependent_option(CATCH_ENABLE_CONFIGURE_TESTS "Enable CMake configuration tests. WARNING: VERY EXPENSIVE" OFF "CATCH_DEVELOPMENT_BUILD" OFF) +cmake_dependent_option(CATCH_ENABLE_CMAKE_HELPER_TESTS "Enable CMake helper tests. WARNING: VERY EXPENSIVE" OFF "CATCH_DEVELOPMENT_BUILD" OFF) + + +# Catch2's build breaks if done in-tree. You probably should not build +# things in tree anyway, but we can allow projects that include Catch2 +# as a subproject to build in-tree as long as it is not in our tree. +if (CMAKE_BINARY_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) + message(FATAL_ERROR "Building in-source is not supported! Create a build dir and remove ${CMAKE_SOURCE_DIR}/CMakeCache.txt") +endif() + +project(Catch2 + VERSION 3.5.3 # CML version placeholder, don't delete + LANGUAGES CXX + # HOMEPAGE_URL is not supported until CMake version 3.12, which + # we do not target yet. + # HOMEPAGE_URL "https://github.com/catchorg/Catch2" + DESCRIPTION "A modern, C++-native, unit test framework." +) + + +# Provide path for scripts. 
We first add path to the scripts we don't use, +# but projects including us might, and set the path up to parent scope. +# Then we also add path that we use to configure the project, but is of +# no use to top level projects. +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/extras") +if (NOT NOT_SUBPROJECT) + set(CMAKE_MODULE_PATH "${CMAKE_MODULE_PATH}" PARENT_SCOPE) +endif() +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/CMake") + +include(GNUInstallDirs) +include(CMakePackageConfigHelpers) +include(CatchConfigOptions) +if(CATCH_DEVELOPMENT_BUILD) + include(CTest) +endif() + +# This variable is used in some subdirectories, so we need it here, rather +# than later in the install block +set(CATCH_CMAKE_CONFIG_DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/Catch2") + +# We have some Windows builds that test `wmain` entry point, +# and we need this change to be present in all binaries that +# are built during these tests, so this is required here, before +# the subdirectories are added. 
+if(CATCH_TEST_USE_WMAIN) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /ENTRY:wmainCRTStartup") +endif() + + +# Basic paths +set(CATCH_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(SOURCES_DIR ${CATCH_DIR}/src/catch2) +set(SELF_TEST_DIR ${CATCH_DIR}/tests/SelfTest) +set(BENCHMARK_DIR ${CATCH_DIR}/tests/Benchmark) +set(EXAMPLES_DIR ${CATCH_DIR}/examples) + +# We need to bring-in the variables defined there to this scope +add_subdirectory(src) + +# Build tests only if requested +if (BUILD_TESTING AND CATCH_BUILD_TESTING AND NOT_SUBPROJECT) + find_package(PythonInterp 3 REQUIRED) + if (NOT PYTHONINTERP_FOUND) + message(FATAL_ERROR "Python not found, but required for tests") + endif() + add_subdirectory(tests) +endif() + +if(CATCH_BUILD_EXAMPLES) + add_subdirectory(examples) +endif() + +if(CATCH_BUILD_EXTRA_TESTS) + add_subdirectory(tests/ExtraTests) +endif() + +if(CATCH_BUILD_FUZZERS) + add_subdirectory(fuzzing) +endif() + +if (CATCH_DEVELOPMENT_BUILD) + add_warnings_to_targets("${CATCH_WARNING_TARGETS}") +endif() + +# Only perform the installation steps when Catch is not being used as +# a subproject via `add_subdirectory`, or the destinations will break, +# see https://github.com/catchorg/Catch2/issues/1373 +if (NOT_SUBPROJECT) + configure_package_config_file( + ${CMAKE_CURRENT_LIST_DIR}/CMake/Catch2Config.cmake.in + ${CMAKE_CURRENT_BINARY_DIR}/Catch2Config.cmake + INSTALL_DESTINATION + ${CATCH_CMAKE_CONFIG_DESTINATION} + ) + + write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/Catch2ConfigVersion.cmake" + COMPATIBILITY + SameMajorVersion + ) + + install( + FILES + "${CMAKE_CURRENT_BINARY_DIR}/Catch2Config.cmake" + "${CMAKE_CURRENT_BINARY_DIR}/Catch2ConfigVersion.cmake" + DESTINATION + ${CATCH_CMAKE_CONFIG_DESTINATION} + ) + + # Install documentation + if(CATCH_INSTALL_DOCS) + install( + DIRECTORY + docs/ + DESTINATION + "${CMAKE_INSTALL_DOCDIR}" + PATTERN "doxygen" EXCLUDE + ) + endif() + + if(CATCH_INSTALL_EXTRAS) + # Install CMake scripts + 
install( + FILES + "extras/ParseAndAddCatchTests.cmake" + "extras/Catch.cmake" + "extras/CatchAddTests.cmake" + "extras/CatchShardTests.cmake" + "extras/CatchShardTestsImpl.cmake" + DESTINATION + ${CATCH_CMAKE_CONFIG_DESTINATION} + ) + + # Install debugger helpers + install( + FILES + "extras/gdbinit" + "extras/lldbinit" + DESTINATION + ${CMAKE_INSTALL_DATAROOTDIR}/Catch2 + ) + endif() + + ## Provide some pkg-config integration + set(PKGCONFIG_INSTALL_DIR + "${CMAKE_INSTALL_DATAROOTDIR}/pkgconfig" + CACHE PATH "Path where catch2.pc is installed" + ) + configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/CMake/catch2.pc.in + ${CMAKE_CURRENT_BINARY_DIR}/catch2.pc + @ONLY + ) + configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/CMake/catch2-with-main.pc.in + ${CMAKE_CURRENT_BINARY_DIR}/catch2-with-main.pc + @ONLY + ) + install( + FILES + "${CMAKE_CURRENT_BINARY_DIR}/catch2.pc" + "${CMAKE_CURRENT_BINARY_DIR}/catch2-with-main.pc" + DESTINATION + ${PKGCONFIG_INSTALL_DIR} + ) + + # CPack/CMake started taking the package version from project version 3.12 + # So we need to set the version manually for older CMake versions + if(${CMAKE_VERSION} VERSION_LESS "3.12.0") + set(CPACK_PACKAGE_VERSION ${PROJECT_VERSION}) + endif() + + set(CPACK_PACKAGE_CONTACT "https://github.com/catchorg/Catch2/") + + + include( CPack ) + +endif(NOT_SUBPROJECT) diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CMakePresets.json b/ethosu/regor/dependencies/thirdparty/Catch2/CMakePresets.json new file mode 100644 index 00000000..88541285 --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CMakePresets.json @@ -0,0 +1,26 @@ +{ + "version": 3, + "configurePresets": [ + { + "name": "basic-tests", + "displayName": "Basic development build", + "description": "Enables development build with basic tests that are cheap to build and run", + "cacheVariables": { + "CATCH_DEVELOPMENT_BUILD": "ON" + } + }, + { + "name": "all-tests", + "inherits": "basic-tests", + "displayName": "Full development build", + 
"description": "Enables development build with examples and ALL tests", + "cacheVariables": { + "CATCH_BUILD_EXAMPLES": "ON", + "CATCH_BUILD_EXTRA_TESTS": "ON", + "CATCH_BUILD_SURROGATES": "ON", + "CATCH_ENABLE_CONFIGURE_TESTS": "ON", + "CATCH_ENABLE_CMAKE_HELPER_TESTS": "ON" + } + } + ] +} diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/CODE_OF_CONDUCT.md b/ethosu/regor/dependencies/thirdparty/Catch2/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..be1a688e --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/CODE_OF_CONDUCT.md @@ -0,0 +1,46 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as contributors and maintainers pledge to making participation in our project and our community a harassment-free experience for everyone, regardless of age, body size, disability, ethnicity, gender identity and expression, level of experience, nationality, personal appearance, race, religion, or sexual identity and orientation. 
+ +## Our Standards + +Examples of behavior that contributes to creating a positive environment include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable behavior and are expected to take appropriate and fair corrective action in response to any instances of unacceptable behavior. + +Project maintainers have the right and responsibility to remove, edit, or reject comments, commits, code, wiki edits, issues, and other contributions that are not aligned to this Code of Conduct, or to ban temporarily or permanently any contributor for other behaviors that they deem inappropriate, threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces when an individual is representing the project or its community. Examples of representing a project or community include using an official project e-mail address, posting via an official social media account, or acting as an appointed representative at an online or offline event. Representation of a project may be further defined and clarified by project maintainers. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be reported by contacting the project team at github@philnash.me. The project team will review and investigate all complaints, and will respond in a way that it deems appropriate to the circumstances. The project team is obligated to maintain confidentiality with regard to the reporter of an incident. Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good faith may face temporary or permanent repercussions as determined by other members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, available at [http://contributor-covenant.org/version/1/4][version] + +[homepage]: http://contributor-covenant.org +[version]: http://contributor-covenant.org/version/1/4/ diff --git a/ethosu/regor/dependencies/thirdparty/Catch2/Doxyfile b/ethosu/regor/dependencies/thirdparty/Catch2/Doxyfile new file mode 100644 index 00000000..914e5984 --- /dev/null +++ b/ethosu/regor/dependencies/thirdparty/Catch2/Doxyfile @@ -0,0 +1,2650 @@ +# Doxyfile 1.9.1 + +# This file describes the settings to be used by the documentation system +# doxygen (www.doxygen.org) for a project. +# +# All text after a double hash (##) is considered a comment and is placed in +# front of the TAG it is preceding. +# +# All text after a single hash (#) is considered a comment and will be ignored. +# The format is: +# TAG = value [value, ...] +# For lists, items can also be appended using: +# TAG += value [value, ...] +# Values that contain spaces should be placed between quotes (\" \"). 
+ +#--------------------------------------------------------------------------- +# Project related configuration options +#--------------------------------------------------------------------------- + +# This tag specifies the encoding used for all characters in the configuration +# file that follow. The default is UTF-8 which is also the encoding used for all +# text before the first occurrence of this tag. Doxygen uses libiconv (or the +# iconv built into libc) for the transcoding. See +# https://www.gnu.org/software/libiconv/ for the list of possible encodings. +# The default value is: UTF-8. + +DOXYFILE_ENCODING = UTF-8 + +# The PROJECT_NAME tag is a single word (or a sequence of words surrounded by +# double-quotes, unless you are using Doxywizard) that should identify the +# project for which the documentation is generated. This name is used in the +# title of most generated pages and in a few other places. +# The default value is: My Project. + +PROJECT_NAME = Catch2 + +# The PROJECT_NUMBER tag can be used to enter a project or revision number. This +# could be handy for archiving the generated documentation or if some version +# control system is used. + +PROJECT_NUMBER = + +# Using the PROJECT_BRIEF tag one can provide an optional one line description +# for a project that appears at the top of each page and should give viewer a +# quick idea about the purpose of the project. Keep the description short. + +PROJECT_BRIEF = "Popular C++ unit testing framework" + +# With the PROJECT_LOGO tag one can specify a logo or an icon that is included +# in the documentation. The maximum height of the logo should not exceed 55 +# pixels and the maximum width should not exceed 200 pixels. Doxygen will copy +# the logo to the output directory. + +PROJECT_LOGO = + +# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path +# into which the generated documentation will be written. 
If a relative path is +# entered, it will be relative to the location where doxygen was started. If +# left blank the current directory will be used. + +OUTPUT_DIRECTORY = docs/doxygen + +# If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- +# directories (in 2 levels) under the output directory of each output format and +# will distribute the generated files over these directories. Enabling this +# option can be useful when feeding doxygen a huge amount of source files, where +# putting all generated files in the same directory would otherwise causes +# performance problems for the file system. +# The default value is: NO. + +CREATE_SUBDIRS = NO + +# If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII +# characters to appear in the names of generated files. If set to NO, non-ASCII +# characters will be escaped, for example _xE3_x81_x84 will be used for Unicode +# U+3044. +# The default value is: NO. + +ALLOW_UNICODE_NAMES = NO + +# The OUTPUT_LANGUAGE tag is used to specify the language in which all +# documentation generated by doxygen is written. Doxygen will use this +# information to generate all constant output in the proper language. +# Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, +# Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), +# Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, +# Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), +# Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, +# Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, +# Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, +# Ukrainian and Vietnamese. +# The default value is: English. + +OUTPUT_LANGUAGE = English + +# The OUTPUT_TEXT_DIRECTION tag is used to specify the direction in which all +# documentation generated by doxygen is written. 
Doxygen will use this +# information to generate all generated output in the proper direction. +# Possible values are: None, LTR, RTL and Context. +# The default value is: None. + +OUTPUT_TEXT_DIRECTION = None + +# If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member +# descriptions after the members that are listed in the file and class +# documentation (similar to Javadoc). Set to NO to disable this. +# The default value is: YES. + +BRIEF_MEMBER_DESC = YES + +# If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief +# description of a member or function before the detailed description +# +# Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the +# brief descriptions will be completely suppressed. +# The default value is: YES. + +REPEAT_BRIEF = YES + +# This tag implements a quasi-intelligent brief description abbreviator that is +# used to form the text in various listings. Each string in this list, if found +# as the leading text of the brief description, will be stripped from the text +# and the result, after processing the whole list, is used as the annotated +# text. Otherwise, the brief description is used as-is. If left blank, the +# following values are used ($name is automatically replaced with the name of +# the entity):The $name class, The $name widget, The $name file, is, provides, +# specifies, contains, represents, a, an and the. + +ABBREVIATE_BRIEF = + +# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then +# doxygen will generate a detailed section even if there is only a brief +# description. +# The default value is: NO. + +ALWAYS_DETAILED_SEC = NO + +# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all +# inherited members of a class in the documentation of that class as if those +# members were ordinary class members. Constructors, destructors and assignment +# operators of the base classes will not be shown. +# The default value is: NO. 
+ +INLINE_INHERITED_MEMB = NO + +# If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path +# before files name in the file list and in the header files. If set to NO the +# shortest path that makes the file name unique will be used +# The default value is: YES. + +FULL_PATH_NAMES = YES + +# The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. +# Stripping is only done if one of the specified strings matches the left-hand +# part of the path. The tag can be used to show relative paths in the file list. +# If left blank the directory from which doxygen is run is used as the path to +# strip. +# +# Note that you can specify absolute paths here, but also relative paths, which +# will be relative from the directory where doxygen is started. +# This tag requires that the tag FULL_PATH_NAMES is set to YES. + +STRIP_FROM_PATH = + +# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the +# path mentioned in the documentation of a class, which tells the reader which +# header file to include in order to use a class. If left blank only the name of +# the header file containing the class definition is used. Otherwise one should +# specify the list of include paths that are normally passed to the compiler +# using the -I flag. + +STRIP_FROM_INC_PATH = + +# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but +# less readable) file names. This can be useful is your file systems doesn't +# support long names like on DOS, Mac, or CD-ROM. +# The default value is: NO. + +SHORT_NAMES = NO + +# If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the +# first line (until the first dot) of a Javadoc-style comment as the brief +# description. If set to NO, the Javadoc-style will behave just like regular Qt- +# style comments (thus requiring an explicit @brief command for a brief +# description.) +# The default value is: NO. 
+ +JAVADOC_AUTOBRIEF = YES + +# If the JAVADOC_BANNER tag is set to YES then doxygen will interpret a line +# such as +# /*************** +# as being the beginning of a Javadoc-style comment "banner". If set to NO, the +# Javadoc-style will behave just like regular comments and it will not be +# interpreted by doxygen. +# The default value is: NO. + +JAVADOC_BANNER = NO + +# If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first +# line (until the first dot) of a Qt-style comment as the brief description. If +# set to NO, the Qt-style will behave just like regular Qt-style comments (thus +# requiring an explicit \brief command for a brief description.) +# The default value is: NO. + +QT_AUTOBRIEF = YES + +# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a +# multi-line C++ special comment block (i.e. a block of //! or /// comments) as +# a brief description. This used to be the default behavior. The new default is +# to treat a multi-line C++ comment block as a detailed description. Set this +# tag to YES if you prefer the old behavior instead. +# +# Note that setting this tag to YES also means that rational rose comments are +# not recognized any more. +# The default value is: NO. + +MULTILINE_CPP_IS_BRIEF = NO + +# By default Python docstrings are displayed as preformatted text and doxygen's +# special commands cannot be used. By setting PYTHON_DOCSTRING to NO the +# doxygen's special commands can be used and the contents of the docstring +# documentation blocks is shown as doxygen documentation. +# The default value is: YES. + +PYTHON_DOCSTRING = YES + +# If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the +# documentation from any documented member that it re-implements. +# The default value is: YES. + +INHERIT_DOCS = YES + +# If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new +# page for each member. 
If set to NO, the documentation of a member will be part +# of the file/class/namespace that contains it. +# The default value is: NO. + +SEPARATE_MEMBER_PAGES = NO + +# The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen +# uses this value to replace tabs by spaces in code fragments. +# Minimum value: 1, maximum value: 16, default value: 4. + +TAB_SIZE = 4 + +# This tag can be used to specify a number of aliases that act as commands in +# the documentation. An alias has the form: +# name=value +# For example adding +# "sideeffect=@par Side Effects:\n" +# will allow you to put the command \sideeffect (or @sideeffect) in the +# documentation, which will result in a user-defined paragraph with heading +# "Side Effects:". You can put \n's in the value part of an alias to insert +# newlines (in the resulting output). You can put ^^ in the value part of an +# alias to insert a newline as if a physical newline was in the original file. +# When you need a literal { or } or , in the value part of an alias you have to +# escape them by means of a backslash (\), this can lead to conflicts with the +# commands \{ and \} for these it is advised to use the version @{ and @} or use +# a double escape (\\{ and \\}) + +ALIASES = "complexity=@par Complexity:" \ + noexcept=**Noexcept** + +# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources +# only. Doxygen will then generate output that is more tailored for C. For +# instance, some of the names that are used will be different. The list of all +# members will be omitted, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_FOR_C = NO + +# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or +# Python sources only. Doxygen will then generate output that is more tailored +# for that language. For instance, namespaces will be presented as packages, +# qualified scopes will look different, etc. +# The default value is: NO. 
+ +OPTIMIZE_OUTPUT_JAVA = NO + +# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran +# sources. Doxygen will then generate output that is tailored for Fortran. +# The default value is: NO. + +OPTIMIZE_FOR_FORTRAN = NO + +# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL +# sources. Doxygen will then generate output that is tailored for VHDL. +# The default value is: NO. + +OPTIMIZE_OUTPUT_VHDL = NO + +# Set the OPTIMIZE_OUTPUT_SLICE tag to YES if your project consists of Slice +# sources only. Doxygen will then generate output that is more tailored for that +# language. For instance, namespaces will be presented as modules, types will be +# separated into more groups, etc. +# The default value is: NO. + +OPTIMIZE_OUTPUT_SLICE = NO + +# Doxygen selects the parser to use depending on the extension of the files it +# parses. With this tag you can assign which parser to use for a given +# extension. Doxygen has a built-in mapping, but you can override or extend it +# using this tag. The format is ext=language, where ext is a file extension, and +# language is one of the parsers supported by doxygen: IDL, Java, JavaScript, +# Csharp (C#), C, C++, D, PHP, md (Markdown), Objective-C, Python, Slice, VHDL, +# Fortran (fixed format Fortran: FortranFixed, free formatted Fortran: +# FortranFree, unknown formatted Fortran: Fortran. In the latter case the parser +# tries to guess whether the code is fixed or free formatted code, this is the +# default for Fortran type files). For instance to make doxygen treat .inc files +# as Fortran files (default is PHP), and .f files as C (default is Fortran), +# use: inc=Fortran f=C. +# +# Note: For files without extension you can use no_extension as a placeholder. +# +# Note that for custom extensions you also need to set FILE_PATTERNS otherwise +# the files are not read by doxygen. When specifying no_extension you should add +# * to the FILE_PATTERNS. 
+# +# Note see also the list of default file extension mappings. + +EXTENSION_MAPPING = + +# If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments +# according to the Markdown format, which allows for more readable +# documentation. See https://daringfireball.net/projects/markdown/ for details. +# The output of markdown processing is further processed by doxygen, so you can +# mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in +# case of backward compatibilities issues. +# The default value is: YES. + +MARKDOWN_SUPPORT = YES + +# When the TOC_INCLUDE_HEADINGS tag is set to a non-zero value, all headings up +# to that level are automatically included in the table of contents, even if +# they do not have an id attribute. +# Note: This feature currently applies only to Markdown headings. +# Minimum value: 0, maximum value: 99, default value: 5. +# This tag requires that the tag MARKDOWN_SUPPORT is set to YES. + +TOC_INCLUDE_HEADINGS = 5 + +# When enabled doxygen tries to link words that correspond to documented +# classes, or namespaces to their corresponding documentation. Such a link can +# be prevented in individual cases by putting a % sign in front of the word or +# globally by setting AUTOLINK_SUPPORT to NO. +# The default value is: YES. + +AUTOLINK_SUPPORT = YES + +# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want +# to include (a tag file for) the STL sources as input, then you should set this +# tag to YES in order to let doxygen match functions declarations and +# definitions whose arguments contain STL classes (e.g. func(std::string); +# versus func(std::string) {}). This also make the inheritance and collaboration +# diagrams that involve STL classes more complete and accurate. +# The default value is: NO. + +BUILTIN_STL_SUPPORT = YES + +# If you use Microsoft's C++/CLI language, you should set this option to YES to +# enable parsing support. +# The default value is: NO. 
+ +CPP_CLI_SUPPORT = NO + +# Set the SIP_SUPPORT tag to YES if your project consists of sip (see: +# https://www.riverbankcomputing.com/software/sip/intro) sources only. Doxygen +# will parse them like normal C++ but will assume all classes use public instead +# of private inheritance when no explicit protection keyword is present. +# The default value is: NO. + +SIP_SUPPORT = NO + +# For Microsoft's IDL there are propget and propput attributes to indicate +# getter and setter methods for a property. Setting this option to YES will make +# doxygen to replace the get and set methods by a property in the documentation. +# This will only work if the methods are indeed getting or setting a simple +# type. If this is not the case, or you want to show the methods anyway, you +# should set this option to NO. +# The default value is: YES. + +IDL_PROPERTY_SUPPORT = YES + +# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC +# tag is set to YES then doxygen will reuse the documentation of the first +# member in the group (if any) for the other members of the group. By default +# all members of a group must be documented explicitly. +# The default value is: NO. + +DISTRIBUTE_GROUP_DOC = NO + +# If one adds a struct or class to a group and this option is enabled, then also +# any nested class or struct is added to the same group. By default this option +# is disabled and one has to add nested compounds explicitly via \ingroup. +# The default value is: NO. + +GROUP_NESTED_COMPOUNDS = NO + +# Set the SUBGROUPING tag to YES to allow class member groups of the same type +# (for instance a group of public functions) to be put as a subgroup of that +# type (e.g. under the Public Functions section). Set it to NO to prevent +# subgrouping. Alternatively, this can be done per class using the +# \nosubgrouping command. +# The default value is: YES. 
+ +SUBGROUPING = YES + +# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions +# are shown inside the group in which they are included (e.g. using \ingroup) +# instead of on a separate page (for HTML and Man pages) or section (for LaTeX +# and RTF). +# +# Note that this feature does not work in combination with +# SEPARATE_MEMBER_PAGES. +# The default value is: NO. + +INLINE_GROUPED_CLASSES = YES + +# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions +# with only public data fields or simple typedef fields will be shown inline in +# the documentation of the scope in which they are defined (i.e. file, +# namespace, or group documentation), provided this scope is documented. If set +# to NO, structs, classes, and unions are shown on a separate page (for HTML and +# Man pages) or section (for LaTeX and RTF). +# The default value is: NO. + +INLINE_SIMPLE_STRUCTS = YES + +# When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or +# enum is documented as struct, union, or enum with the name of the typedef. So +# typedef struct TypeS {} TypeT, will appear in the documentation as a struct +# with name TypeT. When disabled the typedef will appear as a member of a file, +# namespace, or class. And the struct will be named TypeS. This can typically be +# useful for C code in case the coding convention dictates that all compound +# types are typedef'ed and only the typedef is referenced, never the tag name. +# The default value is: NO. + +TYPEDEF_HIDES_STRUCT = NO + +# The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This +# cache is used to resolve symbols given their name and scope. Since this can be +# an expensive process and often the same symbol appears multiple times in the +# code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small +# doxygen will become slower. If the cache is too large, memory is wasted. 
The +# cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range +# is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 +# symbols. At the end of a run doxygen will report the cache usage and suggest +# the optimal cache size from a speed point of view. +# Minimum value: 0, maximum value: 9, default value: 0. + +LOOKUP_CACHE_SIZE = 0 + +# The NUM_PROC_THREADS specifies the number of threads doxygen is allowed to use +# during processing. When set to 0 doxygen will base this on the number of +# cores available in the system. You can set it explicitly to a value larger +# than 0 to get more control over the balance between CPU load and processing +# speed. At this moment only the input processing can be done using multiple +# threads. Since this is still an experimental feature the default is set to 1, +# which effectively disables parallel processing. Please report any issues you +# encounter. Generating dot graphs in parallel is controlled by the +# DOT_NUM_THREADS setting. +# Minimum value: 0, maximum value: 32, default value: 1. + +NUM_PROC_THREADS = 1 + +#--------------------------------------------------------------------------- +# Build related configuration options +#--------------------------------------------------------------------------- + +# If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in +# documentation are documented, even if no documentation was available. Private +# class members and static file members will be hidden unless the +# EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. +# Note: This will also disable the warnings about undocumented members that are +# normally produced when WARNINGS is set to YES. +# The default value is: NO. + +EXTRACT_ALL = YES + +# If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will +# be included in the documentation. +# The default value is: NO. 
+ +EXTRACT_PRIVATE = NO + +# If the EXTRACT_PRIV_VIRTUAL tag is set to YES, documented private virtual +# methods of a class will be included in the documentation. +# The default value is: NO. + +EXTRACT_PRIV_VIRTUAL = NO + +# If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal +# scope will be included in the documentation. +# The default value is: NO. + +EXTRACT_PACKAGE = YES + +# If the EXTRACT_STATIC tag is set to YES, all static members of a file will be +# included in the documentation. +# The default value is: NO. + +EXTRACT_STATIC = YES + +# If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined +# locally in source files will be included in the documentation. If set to NO, +# only classes defined in header files are included. Does not have any effect +# for Java sources. +# The default value is: YES. + +EXTRACT_LOCAL_CLASSES = NO + +# This flag is only useful for Objective-C code. If set to YES, local methods, +# which are defined in the implementation section but not in the interface are +# included in the documentation. If set to NO, only methods in the interface are +# included. +# The default value is: NO. + +EXTRACT_LOCAL_METHODS = NO + +# If this flag is set to YES, the members of anonymous namespaces will be +# extracted and appear in the documentation as a namespace called +# 'anonymous_namespace{file}', where file will be replaced with the base name of +# the file that contains the anonymous namespace. By default anonymous namespace +# are hidden. +# The default value is: NO. + +EXTRACT_ANON_NSPACES = NO + +# If this flag is set to YES, the name of an unnamed parameter in a declaration +# will be determined by the corresponding definition. By default unnamed +# parameters remain unnamed in the output. +# The default value is: YES. + +RESOLVE_UNNAMED_PARAMS = YES + +# If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all +# undocumented members inside documented classes or files. 
If set to NO these +# members will be included in the various overviews, but no documentation +# section is generated. This option has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_MEMBERS = NO + +# If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all +# undocumented classes that are normally visible in the class hierarchy. If set +# to NO, these classes will be included in the various overviews. This option +# has no effect if EXTRACT_ALL is enabled. +# The default value is: NO. + +HIDE_UNDOC_CLASSES = NO + +# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend +# declarations. If set to NO, these declarations will be included in the +# documentation. +# The default value is: NO. + +HIDE_FRIEND_COMPOUNDS = NO + +# If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any +# documentation blocks found inside the body of a function. If set to NO, these +# blocks will be appended to the function's detailed documentation block. +# The default value is: NO. + +HIDE_IN_BODY_DOCS = NO + +# The INTERNAL_DOCS tag determines if documentation that is typed after a +# \internal command is included. If the tag is set to NO then the documentation +# will be excluded. Set it to YES to include the internal documentation. +# The default value is: NO. + +INTERNAL_DOCS = NO + +# With the correct setting of option CASE_SENSE_NAMES doxygen will better be +# able to match the capabilities of the underlying filesystem. In case the +# filesystem is case sensitive (i.e. it supports files in the same directory +# whose names only differ in casing), the option must be set to YES to properly +# deal with such files in case they appear in the input. 
For filesystems that +# are not case sensitive the option should be set to NO to properly deal with +# output files written for symbols that only differ in casing, such as for two +# classes, one named CLASS and the other named Class, and to also support +# references to files without having to specify the exact matching casing. On +# Windows (including Cygwin) and MacOS, users should typically set this option +# to NO, whereas on Linux or other Unix flavors it should typically be set to +# YES. +# The default value is: system dependent. + +CASE_SENSE_NAMES = NO + +# If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with +# their full class and namespace scopes in the documentation. If set to YES, the +# scope will be hidden. +# The default value is: NO. + +HIDE_SCOPE_NAMES = NO + +# If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will +# append additional text to a page's title, such as Class Reference. If set to +# YES the compound reference will be hidden. +# The default value is: NO. + +HIDE_COMPOUND_REFERENCE= NO + +# If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of +# the files that are included by a file in the documentation of that file. +# The default value is: YES. + +SHOW_INCLUDE_FILES = YES + +# If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each +# grouped member an include statement to the documentation, telling the reader +# which file to include in order to use the member. +# The default value is: NO. + +SHOW_GROUPED_MEMB_INC = NO + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include +# files with double quotes in the documentation rather than with sharp brackets. +# The default value is: NO. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the +# documentation for inline members. +# The default value is: YES. 
+ +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the +# (detailed) documentation of file and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. +# The default value is: YES. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief +# descriptions of file, namespace and class members alphabetically by member +# name. If set to NO, the members will appear in declaration order. Note that +# this will also influence the order of the classes in the class list. +# The default value is: NO. + +SORT_BRIEF_DOCS = YES + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the +# (brief and detailed) documentation of class members so that constructors and +# destructors are listed first. If set to NO the constructors will appear in the +# respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. +# Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief +# member documentation. +# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting +# detailed member documentation. +# The default value is: NO. + +SORT_MEMBERS_CTORS_1ST = YES + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy +# of group names into alphabetical order. If set to NO the group names will +# appear in their defined order. +# The default value is: NO. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by +# fully-qualified names, including namespaces. If set to NO, the class list will +# be sorted only by class name, not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the alphabetical +# list. +# The default value is: NO. 
+ +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper +# type resolution of all parameters of a function it will reject a match between +# the prototype and the implementation of a member function even if there is +# only one candidate or it is obvious which candidate to choose by doing a +# simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still +# accept a match between prototype and implementation in such cases. +# The default value is: NO. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo +# list. This list is created by putting \todo commands in the documentation. +# The default value is: YES. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test +# list. This list is created by putting \test commands in the documentation. +# The default value is: YES. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug +# list. This list is created by putting \bug commands in the documentation. +# The default value is: YES. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) +# the deprecated list. This list is created by putting \deprecated commands in +# the documentation. +# The default value is: YES. + +GENERATE_DEPRECATEDLIST= YES + +# The ENABLED_SECTIONS tag can be used to enable conditional documentation +# sections, marked by \if ... \endif and \cond +# ... \endcond blocks. + +ENABLED_SECTIONS = + +# The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the +# initial value of a variable or macro / define can have for it to appear in the +# documentation. If the initializer consists of more lines than specified here +# it will be hidden. Use a value of 0 to hide initializers completely. 
The +# appearance of the value of individual variables and macros / defines can be +# controlled using \showinitializer or \hideinitializer command in the +# documentation regardless of this setting. +# Minimum value: 0, maximum value: 10000, default value: 30. + +MAX_INITIALIZER_LINES = 30 + +# Set the SHOW_USED_FILES tag to NO to disable the list of files generated at +# the bottom of the documentation of classes and structs. If set to YES, the +# list will mention the files that were used to generate the documentation. +# The default value is: YES. + +SHOW_USED_FILES = YES + +# Set the SHOW_FILES tag to NO to disable the generation of the Files page. This +# will remove the Files entry from the Quick Index and from the Folder Tree View +# (if specified). +# The default value is: YES. + +SHOW_FILES = YES + +# Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces +# page. This will remove the Namespaces entry from the Quick Index and from the +# Folder Tree View (if specified). +# The default value is: YES. + +SHOW_NAMESPACES = YES + +# The FILE_VERSION_FILTER tag can be used to specify a program or script that +# doxygen should invoke to get the current version for each file (typically from +# the version control system). Doxygen will invoke the program by executing (via +# popen()) the command <command> <input-file>, where <command> is the value of the +# FILE_VERSION_FILTER tag, and <input-file> is the name of an input file provided +# by doxygen. Whatever the program writes to standard output is used as the file +# version. For an example see the documentation. + +FILE_VERSION_FILTER = + +# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed +# by doxygen. The layout file controls the global structure of the generated +# output files in an output format independent way. To create the layout file +# that represents doxygen's defaults, run doxygen with the -l option. 
You can +# optionally specify a file name after the option, if omitted DoxygenLayout.xml +# will be used as the name of the layout file. +# +# Note that if you run doxygen from a directory containing a file called +# DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE +# tag is left empty. + +LAYOUT_FILE = + +# The CITE_BIB_FILES tag can be used to specify one or more bib files containing +# the reference definitions. This must be a list of .bib files. The .bib +# extension is automatically appended if omitted. This requires the bibtex tool +# to be installed. See also https://en.wikipedia.org/wiki/BibTeX for more info. +# For LaTeX the style of the bibliography can be controlled using +# LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the +# search path. See also \cite for info how to create references. + +CITE_BIB_FILES = + +#--------------------------------------------------------------------------- +# Configuration options related to warning and progress messages +#--------------------------------------------------------------------------- + +# The QUIET tag can be used to turn on/off the messages that are generated to +# standard output by doxygen. If QUIET is set to YES this implies that the +# messages are off. +# The default value is: NO. + +QUIET = NO + +# The WARNINGS tag can be used to turn on/off the warning messages that are +# generated to standard error (stderr) by doxygen. If WARNINGS is set to YES +# this implies that the warnings are on. +# +# Tip: Turn warnings on while writing the documentation. +# The default value is: YES. + +WARNINGS = YES + +# If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate +# warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag +# will automatically be disabled. +# The default value is: YES. 
+ +WARN_IF_UNDOCUMENTED = YES + +# If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for +# potential errors in the documentation, such as not documenting some parameters +# in a documented function, or documenting parameters that don't exist or using +# markup commands wrongly. +# The default value is: YES. + +WARN_IF_DOC_ERROR = YES + +# This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that +# are documented, but have no documentation for their parameters or return +# value. If set to NO, doxygen will only warn about wrong or incomplete +# parameter documentation, but not about the absence of documentation. If +# EXTRACT_ALL is set to YES then this flag will automatically be disabled. +# The default value is: NO. + +WARN_NO_PARAMDOC = YES + +# If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when +# a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS +# then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but +# at the end of the doxygen process doxygen will return with a non-zero status. +# Possible values are: NO, YES and FAIL_ON_WARNINGS. +# The default value is: NO. + +WARN_AS_ERROR = NO + +# The WARN_FORMAT tag determines the format of the warning messages that doxygen +# can produce. The string should contain the $file, $line, and $text tags, which +# will be replaced by the file and line number from which the warning originated +# and the warning text. Optionally the format may contain $version, which will +# be replaced by the version of the file (if it could be obtained via +# FILE_VERSION_FILTER) +# The default value is: $file:$line: $text. + +WARN_FORMAT = "$file:$line: $text" + +# The WARN_LOGFILE tag can be used to specify a file to which warning and error +# messages should be written. If left blank the output is written to standard +# error (stderr). 
+ +WARN_LOGFILE = doxygen.errors + +#--------------------------------------------------------------------------- +# Configuration options related to the input files +#--------------------------------------------------------------------------- + +# The INPUT tag is used to specify the files and/or directories that contain +# documented source files. You may enter file names like myfile.cpp or +# directories like /usr/src/myproject. Separate the files or directories with +# spaces. See also FILE_PATTERNS and EXTENSION_MAPPING +# Note: If this tag is empty the current directory is searched. + +INPUT = src/catch2 + +# This tag can be used to specify the character encoding of the source files +# that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses +# libiconv (or the iconv built into libc) for the transcoding. See the libiconv +# documentation (see: +# https://www.gnu.org/software/libiconv/) for the list of possible encodings. +# The default value is: UTF-8. + +INPUT_ENCODING = UTF-8 + +# If the value of the INPUT tag contains directories, you can use the +# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and +# *.h) to filter out the source-files in the directories. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# read by doxygen. +# +# Note the list of default checked file patterns might differ from the list of +# default file extension mappings. +# +# If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, +# *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, +# *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, +# *.m, *.markdown, *.md, *.mm, *.dox (to be provided as doxygen C comment), +# *.py, *.pyw, *.f90, *.f95, *.f03, *.f08, *.f18, *.f, *.for, *.vhd, *.vhdl, +# *.ucf, *.qsf and *.ice. 
+ +FILE_PATTERNS = *.c \ + *.cc \ + *.cxx \ + *.cpp \ + *.c++ \ + *.java \ + *.ii \ + *.ixx \ + *.ipp \ + *.i++ \ + *.inl \ + *.idl \ + *.ddl \ + *.odl \ + *.h \ + *.hh \ + *.hxx \ + *.hpp \ + *.h++ \ + *.cs \ + *.d \ + *.php \ + *.php4 \ + *.php5 \ + *.phtml \ + *.inc \ + *.m \ + *.markdown \ + *.md \ + *.mm \ + *.dox \ + *.py \ + *.pyw \ + *.f90 \ + *.f95 \ + *.f03 \ + *.f08 \ + *.f18 \ + *.f \ + *.for \ + *.vhd \ + *.vhdl \ + *.ucf \ + *.qsf \ + *.ice + +# The RECURSIVE tag can be used to specify whether or not subdirectories should +# be searched for input files as well. +# The default value is: NO. + +RECURSIVE = YES + +# The EXCLUDE tag can be used to specify files and/or directories that should be +# excluded from the INPUT source files. This way you can easily exclude a +# subdirectory from a directory tree whose root is specified with the INPUT tag. +# +# Note that relative paths are relative to the directory from which doxygen is +# run. + +EXCLUDE = + +# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or +# directories that are symbolic links (a Unix file system feature) are excluded +# from the input. +# The default value is: NO. + +EXCLUDE_SYMLINKS = NO + +# If the value of the INPUT tag contains directories, you can use the +# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude +# certain files from those directories. +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories for example use the pattern */test/* + +EXCLUDE_PATTERNS = */lib/* + +# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names +# (namespaces, classes, functions, etc.) that should be excluded from the +# output. The symbol name can be a fully qualified name, a word, or if the +# wildcard * is used, a substring. 
Examples: ANamespace, AClass, +# AClass::ANamespace, ANamespace::*Test +# +# Note that the wildcards are matched against the file with absolute path, so to +# exclude all test directories use the pattern */test/* + +EXCLUDE_SYMBOLS = + +# The EXAMPLE_PATH tag can be used to specify one or more files or directories +# that contain example code fragments that are included (see the \include +# command). + +EXAMPLE_PATH = + +# If the value of the EXAMPLE_PATH tag contains directories, you can use the +# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and +# *.h) to filter out the source-files in the directories. If left blank all +# files are included. + +EXAMPLE_PATTERNS = + +# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be +# searched for input files to be used with the \include or \dontinclude commands +# irrespective of the value of the RECURSIVE tag. +# The default value is: NO. + +EXAMPLE_RECURSIVE = NO + +# The IMAGE_PATH tag can be used to specify one or more files or directories +# that contain images that are to be included in the documentation (see the +# \image command). + +IMAGE_PATH = + +# The INPUT_FILTER tag can be used to specify a program that doxygen should +# invoke to filter for each input file. Doxygen will invoke the filter program +# by executing (via popen()) the command: +# +# <filter> <input-file> +# +# where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the +# name of an input file. Doxygen will then use the output that the filter +# program writes to standard output. If FILTER_PATTERNS is specified, this tag +# will be ignored. +# +# Note that the filter must not add or remove lines; it is applied before the +# code is scanned, but not when the output code is generated. If lines are added +# or removed, the anchors will not be placed correctly. 
+# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +INPUT_FILTER = + +# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern +# basis. Doxygen will compare the file name with each pattern and apply the +# filter if there is a match. The filters are a list of the form: pattern=filter +# (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how +# filters are used. If the FILTER_PATTERNS tag is empty or if none of the +# patterns match the file name, INPUT_FILTER is applied. +# +# Note that for custom extensions or not directly supported extensions you also +# need to set EXTENSION_MAPPING for the extension otherwise the files are not +# properly processed by doxygen. + +FILTER_PATTERNS = + +# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using +# INPUT_FILTER) will also be used to filter the input files that are used for +# producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). +# The default value is: NO. + +FILTER_SOURCE_FILES = NO + +# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file +# pattern. A pattern will override the setting for FILTER_PATTERN (if any) and +# it is also possible to disable source filtering for a specific pattern using +# *.ext= (so without naming a filter). +# This tag requires that the tag FILTER_SOURCE_FILES is set to YES. + +FILTER_SOURCE_PATTERNS = + +# If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that +# is part of the input, its contents will be placed on the main page +# (index.html). This can be useful if you have a project on for instance GitHub +# and want to reuse the introduction page also for the doxygen output. 
+ +USE_MDFILE_AS_MAINPAGE = + +#--------------------------------------------------------------------------- +# Configuration options related to source browsing +#--------------------------------------------------------------------------- + +# If the SOURCE_BROWSER tag is set to YES then a list of source files will be +# generated. Documented entities will be cross-referenced with these sources. +# +# Note: To get rid of all source code in the generated output, make sure that +# also VERBATIM_HEADERS is set to NO. +# The default value is: NO. + +SOURCE_BROWSER = NO + +# Setting the INLINE_SOURCES tag to YES will include the body of functions, +# classes and enums directly into the documentation. +# The default value is: NO. + +INLINE_SOURCES = NO + +# Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any +# special comment blocks from generated source code fragments. Normal C, C++ and +# Fortran comments will always remain visible. +# The default value is: YES. + +STRIP_CODE_COMMENTS = NO + +# If the REFERENCED_BY_RELATION tag is set to YES then for each documented +# entity all documented functions referencing it will be listed. +# The default value is: NO. + +REFERENCED_BY_RELATION = NO + +# If the REFERENCES_RELATION tag is set to YES then for each documented function +# all documented entities called/used by that function will be listed. +# The default value is: NO. + +REFERENCES_RELATION = NO + +# If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set +# to YES then the hyperlinks from functions in REFERENCES_RELATION and +# REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will +# link to the documentation. +# The default value is: YES. 
+ +REFERENCES_LINK_SOURCE = NO + +# If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the +# source code will show a tooltip with additional information such as prototype, +# brief description and links to the definition and documentation. Since this +# will make the HTML file larger and loading of large files a bit slower, you +# can opt to disable this feature. +# The default value is: YES. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +SOURCE_TOOLTIPS = YES + +# If the USE_HTAGS tag is set to YES then the references to source code will +# point to the HTML generated by the htags(1) tool instead of doxygen built-in +# source browser. The htags tool is part of GNU's global source tagging system +# (see https://www.gnu.org/software/global/global.html). You will need version +# 4.8.6 or higher. +# +# To use it do the following: +# - Install the latest version of global +# - Enable SOURCE_BROWSER and USE_HTAGS in the configuration file +# - Make sure the INPUT points to the root of the source tree +# - Run doxygen as normal +# +# Doxygen will invoke htags (and that will in turn invoke gtags), so these +# tools must be available from the command line (i.e. in the search path). +# +# The result: instead of the source browser generated by doxygen, the links to +# source code will now point to the output of htags. +# The default value is: NO. +# This tag requires that the tag SOURCE_BROWSER is set to YES. + +USE_HTAGS = NO + +# If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a +# verbatim copy of the header file for each class for which an include is +# specified. Set to NO to disable this. +# See also: Section \class. +# The default value is: YES. + +VERBATIM_HEADERS = YES + +# If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the +# clang parser (see: +# http://clang.llvm.org/) for more accurate parsing at the cost of reduced +# performance. 
This can be particularly helpful with template rich C++ code for +# which doxygen's built-in parser lacks the necessary type information. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. +# The default value is: NO. + +CLANG_ASSISTED_PARSING = NO + +# If clang assisted parsing is enabled and the CLANG_ADD_INC_PATHS tag is set to +# YES then doxygen will add the directory of each input to the include path. +# The default value is: YES. + +CLANG_ADD_INC_PATHS = YES + +# If clang assisted parsing is enabled you can provide the compiler with command +# line options that you would normally use when invoking the compiler. Note that +# the include paths will already be set by doxygen for the files and directories +# specified with INPUT and INCLUDE_PATH. +# This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. + +CLANG_OPTIONS = + +# If clang assisted parsing is enabled you can provide the clang parser with the +# path to the directory containing a file called compile_commands.json. This +# file is the compilation database (see: +# http://clang.llvm.org/docs/HowToSetupToolingForLLVM.html) containing the +# options used when the source files were built. This is equivalent to +# specifying the -p option to a clang tool, such as clang-check. These options +# will then be passed to the parser. Any options specified with CLANG_OPTIONS +# will be added as well. +# Note: The availability of this option depends on whether or not doxygen was +# generated with the -Duse_libclang=ON option for CMake. + +CLANG_DATABASE_PATH = + +#--------------------------------------------------------------------------- +# Configuration options related to the alphabetical class index +#--------------------------------------------------------------------------- + +# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all +# compounds will be generated. 
Enable this if the project contains a lot of +# classes, structs, unions or interfaces. +# The default value is: YES. + +ALPHABETICAL_INDEX = YES + +# In case all classes in a project start with a common prefix, all classes will +# be put under the same header in the alphabetical index. The IGNORE_PREFIX tag +# can be used to specify a prefix (or a list of prefixes) that should be ignored +# while generating the index headers. +# This tag requires that the tag ALPHABETICAL_INDEX is set to YES. + +IGNORE_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the HTML output +#--------------------------------------------------------------------------- + +# If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output +# The default value is: YES. + +GENERATE_HTML = YES + +# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a +# relative path is entered the value of OUTPUT_DIRECTORY will be put in front of +# it. +# The default directory is: html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_OUTPUT = html + +# The HTML_FILE_EXTENSION tag can be used to specify the file extension for each +# generated HTML page (for example: .htm, .php, .asp). +# The default value is: .html. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FILE_EXTENSION = .html + +# The HTML_HEADER tag can be used to specify a user-defined HTML header file for +# each generated HTML page. If the tag is left blank doxygen will generate a +# standard header. +# +# To get valid HTML the header file that includes any scripts and style sheets +# that doxygen needs, which is dependent on the configuration options used (e.g. +# the setting GENERATE_TREEVIEW). It is highly recommended to start with a +# default header using +# doxygen -w html new_header.html new_footer.html new_stylesheet.css +# YourConfigFile +# and then modify the file new_header.html. 
See also section "Doxygen usage" +# for information on how to generate the default header that doxygen normally +# uses. +# Note: The header is subject to change so you typically have to regenerate the +# default header when upgrading to a newer version of doxygen. For a description +# of the possible markers and block names see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_HEADER = + +# The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each +# generated HTML page. If the tag is left blank doxygen will generate a standard +# footer. See HTML_HEADER for more information on how to generate a default +# footer and what special commands can be used inside the footer. See also +# section "Doxygen usage" for information on how to generate the default footer +# that doxygen normally uses. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FOOTER = + +# The HTML_STYLESHEET tag can be used to specify a user-defined cascading style +# sheet that is used by each HTML page. It can be used to fine-tune the look of +# the HTML output. If left blank doxygen will generate a default style sheet. +# See also section "Doxygen usage" for information on how to generate the style +# sheet that doxygen normally uses. +# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as +# it is more robust and this tag (HTML_STYLESHEET) will in the future become +# obsolete. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_STYLESHEET = + +# The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined +# cascading style sheets that are included after the standard style sheets +# created by doxygen. Using this option one can overrule certain style aspects. +# This is preferred over using HTML_STYLESHEET since it does not replace the +# standard style sheet and is therefore more robust against future updates. 
+# Doxygen will copy the style sheet files to the output directory. +# Note: The order of the extra style sheet files is of importance (e.g. the last +# style sheet in the list overrules the setting of the previous ones in the +# list). For an example see the documentation. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_STYLESHEET = + +# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or +# other source files which should be copied to the HTML output directory. Note +# that these files will be copied to the base HTML output directory. Use the +# $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these +# files. In the HTML_STYLESHEET file, use the file name only. Also note that the +# files will be copied as-is; there are no commands or markers available. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_EXTRA_FILES = + +# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen +# will adjust the colors in the style sheet and background images according to +# this color. Hue is specified as an angle on a colorwheel, see +# https://en.wikipedia.org/wiki/Hue for more information. For instance the value +# 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 +# purple, and 360 is red again. +# Minimum value: 0, maximum value: 359, default value: 220. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_HUE = 220 + +# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors +# in the HTML output. For a value of 0 the output will use grayscales only. A +# value of 255 will produce the most vivid colors. +# Minimum value: 0, maximum value: 255, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_SAT = 100 + +# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the +# luminance component of the colors in the HTML output. 
Values below 100 +# gradually make the output lighter, whereas values above 100 make the output +# darker. The value divided by 100 is the actual gamma applied, so 80 represents +# a gamma of 0.8, The value 220 represents a gamma of 2.2, and 100 does not +# change the gamma. +# Minimum value: 40, maximum value: 240, default value: 80. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_COLORSTYLE_GAMMA = 80 + +# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML +# page will contain the date and time when the page was generated. Setting this +# to YES can help to show when doxygen was last run and thus if the +# documentation is up to date. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_TIMESTAMP = NO + +# If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML +# documentation will contain a main index with vertical navigation menus that +# are dynamically created via JavaScript. If disabled, the navigation index will +# consists of multiple levels of tabs that are statically embedded in every HTML +# page. Disable this option to support browsers that do not have JavaScript, +# like the Qt help browser. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_MENUS = YES + +# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML +# documentation will contain sections that can be hidden and shown after the +# page has loaded. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_DYNAMIC_SECTIONS = NO + +# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries +# shown in the various tree structured indices initially; the user can expand +# and collapse entries dynamically later on. 
Doxygen will expand the tree to +# such a level that at most the specified number of entries are visible (unless +# a fully collapsed tree already exceeds this amount). So setting the number of +# entries 1 will produce a full collapsed tree by default. 0 is a special value +# representing an infinite number of entries and will result in a full expanded +# tree by default. +# Minimum value: 0, maximum value: 9999, default value: 100. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_INDEX_NUM_ENTRIES = 100 + +# If the GENERATE_DOCSET tag is set to YES, additional index files will be +# generated that can be used as input for Apple's Xcode 3 integrated development +# environment (see: +# https://developer.apple.com/xcode/), introduced with OSX 10.5 (Leopard). To +# create a documentation set, doxygen will generate a Makefile in the HTML +# output directory. Running make will produce the docset in that directory and +# running make install will install the docset in +# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at +# startup. See https://developer.apple.com/library/archive/featuredarticles/Doxy +# genXcode/_index.html for more information. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_DOCSET = NO + +# This tag determines the name of the docset feed. A documentation feed provides +# an umbrella under which multiple documentation sets from a single provider +# (such as a company or product suite) can be grouped. +# The default value is: Doxygen generated docs. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_FEEDNAME = "Doxygen generated docs" + +# This tag specifies a string that should uniquely identify the documentation +# set bundle. This should be a reverse domain-name style string, e.g. +# com.mycompany.MyDocSet. Doxygen will append .docset to the name. +# The default value is: org.doxygen.Project. 
+# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_BUNDLE_ID = org.doxygen.Project + +# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify +# the documentation publisher. This should be a reverse domain-name style +# string, e.g. com.mycompany.MyDocSet.documentation. +# The default value is: org.doxygen.Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_ID = org.doxygen.Publisher + +# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. +# The default value is: Publisher. +# This tag requires that the tag GENERATE_DOCSET is set to YES. + +DOCSET_PUBLISHER_NAME = Publisher + +# If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three +# additional HTML index files: index.hhp, index.hhc, and index.hhk. The +# index.hhp is a project file that can be read by Microsoft's HTML Help Workshop +# (see: +# https://www.microsoft.com/en-us/download/details.aspx?id=21138) on Windows. +# +# The HTML Help Workshop contains a compiler that can convert all HTML output +# generated by doxygen into a single compiled HTML file (.chm). Compiled HTML +# files are now used as the Windows 98 help format, and will replace the old +# Windows help format (.hlp) on all Windows platforms in the future. Compressed +# HTML files also contain an index, a table of contents, and you can search for +# words in the documentation. The HTML workshop also contains a viewer for +# compressed HTML files. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_HTMLHELP = NO + +# The CHM_FILE tag can be used to specify the file name of the resulting .chm +# file. You can add a path in front of the file if the result should not be +# written to the html output directory. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
+ +CHM_FILE = + +# The HHC_LOCATION tag can be used to specify the location (absolute path +# including file name) of the HTML help compiler (hhc.exe). If non-empty, +# doxygen will try to run the HTML help compiler on the generated index.hhp. +# The file has to be specified with full path. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +HHC_LOCATION = + +# The GENERATE_CHI flag controls if a separate .chi index file is generated +# (YES) or that it should be included in the main .chm file (NO). +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +GENERATE_CHI = NO + +# The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) +# and project file content. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +CHM_INDEX_ENCODING = + +# The BINARY_TOC flag controls whether a binary table of contents is generated +# (YES) or a normal table of contents (NO) in the .chm file. Furthermore it +# enables the Previous and Next buttons. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +BINARY_TOC = NO + +# The TOC_EXPAND flag can be set to YES to add extra items for group members to +# the table of contents of the HTML help documentation and to the tree view. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTMLHELP is set to YES. + +TOC_EXPAND = NO + +# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and +# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that +# can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help +# (.qch) of the generated HTML documentation. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_QHP = NO + +# If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify +# the file name of the resulting .qch file. 
The path specified is relative to +# the HTML output folder. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QCH_FILE = + +# The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help +# Project output. For more information please see Qt Help Project / Namespace +# (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#namespace). +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_NAMESPACE = org.doxygen.Project + +# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt +# Help Project output. For more information please see Qt Help Project / Virtual +# Folders (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#virtual-folders). +# The default value is: doc. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_VIRTUAL_FOLDER = doc + +# If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom +# filter to add. For more information please see Qt Help Project / Custom +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_NAME = + +# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the +# custom filter to add. For more information please see Qt Help Project / Custom +# Filters (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#custom-filters). +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHP_CUST_FILTER_ATTRS = + +# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this +# project's filter section matches. Qt Help Project / Filter Attributes (see: +# https://doc.qt.io/archives/qt-4.8/qthelpproject.html#filter-attributes). +# This tag requires that the tag GENERATE_QHP is set to YES. 
+ +QHP_SECT_FILTER_ATTRS = + +# The QHG_LOCATION tag can be used to specify the location (absolute path +# including file name) of Qt's qhelpgenerator. If non-empty doxygen will try to +# run qhelpgenerator on the generated .qhp file. +# This tag requires that the tag GENERATE_QHP is set to YES. + +QHG_LOCATION = + +# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be +# generated, together with the HTML files, they form an Eclipse help plugin. To +# install this plugin and make it available under the help contents menu in +# Eclipse, the contents of the directory containing the HTML and XML files needs +# to be copied into the plugins directory of eclipse. The name of the directory +# within the plugins directory should be the same as the ECLIPSE_DOC_ID value. +# After copying Eclipse needs to be restarted before the help appears. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_ECLIPSEHELP = NO + +# A unique identifier for the Eclipse help plugin. When installing the plugin +# the directory name containing the HTML and XML files should also have this +# name. Each documentation set should have its own identifier. +# The default value is: org.doxygen.Project. +# This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. + +ECLIPSE_DOC_ID = org.doxygen.Project + +# If you want full control over the layout of the generated HTML pages it might +# be necessary to disable the index and replace it with your own. The +# DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top +# of each HTML page. A value of NO enables the index and the value YES disables +# it. Since the tabs in the index contain the same information as the navigation +# tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. 
+ +DISABLE_INDEX = NO + +# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index +# structure should be generated to display hierarchical information. If the tag +# value is set to YES, a side panel will be generated containing a tree-like +# index structure (just like the one that is generated for HTML Help). For this +# to work a browser that supports JavaScript, DHTML, CSS and frames is required +# (i.e. any modern browser). Windows users are probably better off using the +# HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can +# further fine-tune the look of the index. As an example, the default style +# sheet generated by doxygen has an example that shows how to put an image at +# the root of the tree instead of the PROJECT_NAME. Since the tree basically has +# the same information as the tab index, you could consider setting +# DISABLE_INDEX to YES when enabling this option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +GENERATE_TREEVIEW = NO + +# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that +# doxygen will group on one line in the generated HTML documentation. +# +# Note that a value of 0 will completely suppress the enum values from appearing +# in the overview section. +# Minimum value: 0, maximum value: 20, default value: 4. +# This tag requires that the tag GENERATE_HTML is set to YES. + +ENUM_VALUES_PER_LINE = 4 + +# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used +# to set the initial width (in pixels) of the frame in which the tree is shown. +# Minimum value: 0, maximum value: 1500, default value: 250. +# This tag requires that the tag GENERATE_HTML is set to YES. + +TREEVIEW_WIDTH = 250 + +# If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to +# external symbols imported via tag files in a separate window. +# The default value is: NO. 
+# This tag requires that the tag GENERATE_HTML is set to YES. + +EXT_LINKS_IN_WINDOW = NO + +# If the HTML_FORMULA_FORMAT option is set to svg, doxygen will use the pdf2svg +# tool (see https://github.com/dawbarton/pdf2svg) or inkscape (see +# https://inkscape.org) to generate formulas as SVG images instead of PNGs for +# the HTML output. These images will generally look nicer at scaled resolutions. +# Possible values are: png (the default) and svg (looks nicer but requires the +# pdf2svg or inkscape tool). +# The default value is: png. +# This tag requires that the tag GENERATE_HTML is set to YES. + +HTML_FORMULA_FORMAT = png + +# Use this tag to change the font size of LaTeX formulas included as images in +# the HTML documentation. When you change the font size after a successful +# doxygen run you need to manually remove any form_*.png images from the HTML +# output directory to force them to be regenerated. +# Minimum value: 8, maximum value: 50, default value: 10. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_FONTSIZE = 10 + +# Use the FORMULA_TRANSPARENT tag to determine whether or not the images +# generated for formulas are transparent PNGs. Transparent PNGs are not +# supported properly for IE 6.0, but are supported on all modern browsers. +# +# Note that when changing this option you need to delete any form_*.png files in +# the HTML output directory before the changes have effect. +# The default value is: YES. +# This tag requires that the tag GENERATE_HTML is set to YES. + +FORMULA_TRANSPARENT = YES + +# The FORMULA_MACROFILE can contain LaTeX \newcommand and \renewcommand commands +# to create new LaTeX commands to be used in formulas as building blocks. See +# the section "Including formulas" for details. 
+ +FORMULA_MACROFILE = + +# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see +# https://www.mathjax.org) which uses client side JavaScript for the rendering +# instead of using pre-rendered bitmaps. Use this if you do not have LaTeX +# installed or if you want the formulas to look prettier in the HTML output. When +# enabled you may also need to install MathJax separately and configure the path +# to it using the MATHJAX_RELPATH option. +# The default value is: NO. +# This tag requires that the tag GENERATE_HTML is set to YES. + +USE_MATHJAX = YES + +# When MathJax is enabled you can set the default output format to be used for +# the MathJax output. See the MathJax site (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. +# Possible values are: HTML-CSS (which is slower, but has the best +# compatibility), NativeMML (i.e. MathML) and SVG. +# The default value is: HTML-CSS. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_FORMAT = HTML-CSS + +# When MathJax is enabled you need to specify the location relative to the HTML +# output directory using the MATHJAX_RELPATH option. The destination directory +# should contain the MathJax.js script. For instance, if the mathjax directory +# is located at the same level as the HTML output directory, then +# MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax +# Content Delivery Network so you can quickly see the result without installing +# MathJax. However, it is strongly recommended to install a local copy of +# MathJax from https://www.mathjax.org before deployment. +# The default value is: https://cdn.jsdelivr.net/npm/mathjax@2. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_RELPATH = https://cdn.jsdelivr.net/npm/mathjax@2 + +# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax +# extension names that should be enabled during MathJax rendering. 
For example +# MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_EXTENSIONS = TeX/AMSmath \ + TeX/AMSsymbols + +# The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces +# of code that will be used on startup of the MathJax code. See the MathJax site +# (see: +# http://docs.mathjax.org/en/v2.7-latest/output.html) for more details. For an +# example see the documentation. +# This tag requires that the tag USE_MATHJAX is set to YES. + +MATHJAX_CODEFILE = + +# When the SEARCHENGINE tag is enabled doxygen will generate a search box for +# the HTML output. The underlying search engine uses javascript and DHTML and +# should work on any modern browser. Note that when using HTML help +# (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) +# there is already a search function so this one should typically be disabled. +# For large projects the javascript based search engine can be slow, then +# enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to +# search using the keyboard; to jump to the search box use + S +# (what the is depends on the OS and browser, but it is typically +# , /