diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000000000000000000000000000000000000..e999975e0eb358b01dce85745bed93935780a195 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "highway"] + path = highway + url = https://github.com/cambridgeconsultants/highway + branch = highway_upstream diff --git a/CMakeLists.txt b/CMakeLists.txt index 1124e6031e4c8fde73fa72202a4a62168ecdef15..448c44bbe9857990ddf363103338776ef6fb4a66 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,7 +29,8 @@ option(ARMRAL_ENABLE_COVERAGE option(BUILD_SIMULATION "Enable building channel simulation programs" ON) set(ARMRAL_ARCH NEON - CACHE STRING "The architecture to build for ('NEON' or 'SVE2')") + CACHE STRING + "The architecture to build for ('NEON', 'SVE', 'SVE2' or 'HWY')") set_property(CACHE ARMRAL_ARCH PROPERTY STRINGS "NEON" "SVE2") set(ARMRAL_LIB_SOURCES @@ -267,6 +268,11 @@ if(CMAKE_VERSION VERSION_GREATER 3.15) set(JOB_POOL_CONSOLE JOB_POOL console) endif() +if(ARMRAL_ARCH STREQUAL "HWY") + include(armral_hwy.cmake) + return() +endif() + if(NOT ARMRAL_OPT_FLAGS AND NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) # If the optimization flags are already set, don't try and guess what they # should be. diff --git a/CREDITS.md b/CREDITS.md index 0271d77b8ea30bc4da7ed29edf1cb040d2e08f64..e883c27fcfdc2b8835362c2ad9f787cb4a1ed9eb 100644 --- a/CREDITS.md +++ b/CREDITS.md @@ -2,6 +2,17 @@ In addition to the primary development being done by Arm, the following people and organizations have contributed to Arm RAN Acceleration Library: +- Addition of the Google Highway CRC implementation in + `src/UpperPHY/CRC/highway/crc_common.hpp` was contributed by + Cambridge Consultants. See + . + +- Addition of Google Highway as a fourth architecture (`-DARMRAL_ARCH=HWY`), + enabling future development using Google Highway platform-agnostic + intrinsic implementations, was contributed upstream by Cambridge + Consultants. See + . 
+ - Work on `armral_ldpc_rate_recovery` to correctly set the log-likelihood ratios of filler bits was contributed upstream by 4g5g Consultants. See @@ -32,4 +43,4 @@ Acceleration Library: - Work on `armral_ldpc_rate_matching` and `armral_ldpc_rate_recovery` to support soft buffer sizes was contributed upstream by 4g5g Consultants. See - . + . \ No newline at end of file diff --git a/armral_hwy.cmake b/armral_hwy.cmake new file mode 100644 index 0000000000000000000000000000000000000000..6808b65e4003ce59eea12b353619587ac087645b --- /dev/null +++ b/armral_hwy.cmake @@ -0,0 +1,695 @@ +cmake_minimum_required(VERSION 3.10) + +# TODO possibly switch highway from a submodule to ExternalProject_Add +set(HWY_ENABLE_CONTRIB + OFF + CACHE BOOL "Include HWY contrib/ folder") +set(HWY_ENABLE_EXAMPLES + OFF + CACHE BOOL "Build HWY examples") +# set(HWY_ENABLE_INSTALL ON CACHE BOOL "Install HWY library") +set(HWY_ENABLE_TESTS + OFF + CACHE BOOL "Enable HWY tests") + +include(CheckCXXCompilerFlag) +check_cxx_compiler_flag("-march=native" COMPILER_OPT_ARCH_NATIVE_SUPPORTED) + +if(ARMRAL_OPT_FLAGS) + set(ARMRAL_ARCH_COMPILE_OPTIONS ${ARMRAL_OPT_FLAGS}) + # handle configuring static dispatch for a specified -m string + set(HWY_COMPILE_ONLY_STATIC + ON + CACHE BOOL "") + add_compile_options(${ARMRAL_ARCH_COMPILE_OPTIONS}) +elseif(COMPILER_OPT_ARCH_NATIVE_SUPPORTED) + # pick a less conservative baseline where possible + add_compile_options("-march=native") +endif() +add_subdirectory(highway) + +set(ARMRAL_ARCH_TYPE "ARMRAL_ARCH_HWY=1") + +if(ARMRAL_OPT_FLAGS) + target_compile_definitions(hwy PUBLIC HWY_COMPILE_ONLY_STATIC) +endif() + +# The PMULL instruction requires the AES extension which is only available under +# NEON and SVE2 on aarch64. 
We have disabled SVE for all Arm platforms when +# PMULL is required, to avoid falling back to (slower) generic implementations +set_property( + SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp + APPEND + PROPERTY COMPILE_DEFINITIONS HWY_DISABLED_TARGETS=HWY_SVE_256|HWY_SVE) + +# GCC recognizes the usage of XOR as an associative operation, then it tries to +# optimize the operation tree in its tree-reassoc pass, but it actually makes +# the performance much worse. Disabling the tree-reassoc pass means that the +# compiler uses our carefully balanced operation tree instead. +set_property( + SOURCE ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp + APPEND + PROPERTY COMPILE_OPTIONS $<$<CXX_COMPILER_ID:GNU>:-fno-tree-reassoc>) + +set(ARMRAL_LIB_SOURCES + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_hermitian_mat_inversion_f32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixInv/arm_cmplx_mat_inversion_f32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_aah_f32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_ahb_f32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_f32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16.cpp + # 
${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_matmul_i16_32bit.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_f32.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_cmplx_mat_vec_mult_i16_32bit.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_1sc.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_4sc.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_6sc.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixMult/arm_solve_f32.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/MatrixPseudoInv/arm_cmplx_pseudo_inverse_direct_f32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_f32_2.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_2_32bit.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorDotProd/arm_cmplx_vecdot_i16_32bit.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_f32_2.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/BasicMathFun/VectorMult/arm_cmplx_vecmul_i16_2.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_compression.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/MuLawCompression/arm_mu_law_decompression.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_compression.cpp + # 
${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockFloat/arm_block_float_decompression.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_compression.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/DuRuInterface/ORanBlockScaling/arm_block_scaling_decompression.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Correlation/arm_correlation.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/bluestein.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gs.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ab_t_gu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_gu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_n_uu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cf32_ac_t_uu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ab_t_gu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_cf32_cs16_ac_n_uu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cf32_kernel_lookup.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cf32_ac_n_uu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_cf32_cs16_ac_n_uu.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_cs16_kernel_lookup.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_execute.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_level.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/fft_plan.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FFT/rader_generator.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cf32_decimate_2.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16.c + # 
${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/FIR/arm_fir_filter_cs16_decimate_2.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/Scrambling/arm_scrambling.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/LowerPHY/SeqGenerator/arm_mat_seq_generator.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc11.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc16.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_a.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_b.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc24_c.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/CRC/arm_crc6.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_decoder.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/ConvolutionalEncoder/arm_convolutional_encoder.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Demodulation/arm_demodulation.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_decoder.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_encoder.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_matching.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/LDPC/ldpc_rate_recovery.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Modulation/arm_modulation.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_attachment.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_crc_check.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_decoder.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_encoder.c + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_frozen_bits.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_matching.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_rate_recovery.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_deinterleave.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Polar/arm_polar_subchannel_interleave.cpp + # 
${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_decoder.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_encoder.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_matching.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/UpperPHY/Turbo/arm_turbo_rate_recovery.cpp + # ${CMAKE_CURRENT_SOURCE_DIR}/src/MatrixFactorizations/SVD/arm_svd.cpp +) + +if(NOT ARMRAL_OVERRIDE_COMPILE_FLAGS) + set(ARMRAL_COMPILER_FLAGS + ${ARMRAL_COMPILER_FLAGS} + $<$<COMPILE_LANGUAGE:C>:-Wshadow + -Wall + -Wcast-qual> + $<$<COMPILE_LANGUAGE:CXX>:-Wshadow + -Wall + -Wcast-qual + -fno-rtti + -fno-exceptions + -std=c++17> + $<$<CONFIG:Debug>:-Og + -g3 + -ggdb + -fno-omit-frame-pointer>) + # Disable GLIBCXX assertions to avoid introducing dependency on libstdc++ + add_definitions(-D_GLIBCXX_NO_ASSERTIONS) + message(STATUS "Using compilation flags: ${ARMRAL_COMPILER_FLAGS}") +else() + # If the CMAKE_C_FLAGS is set, CMake already deals with putting this on the + # compile line + message(STATUS "Overriding compilation flags with manually set flags") + message(STATUS "CMAKE_C_FLAGS: ${CMAKE_C_FLAGS}") + message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") + set(ARMRAL_COMPILER_FLAGS "") + set(ARMRAL_LINKER_FLAGS "") +endif() + +add_library(armral ${ARMRAL_LIB_SOURCES}) +target_include_directories(armral PUBLIC ${ARMRAL_LIB_INC}) +target_compile_definitions(armral PUBLIC ${ARMRAL_ARCH_TYPE}) +target_compile_options(armral PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} + ${ARMRAL_COMPILER_FLAGS}) +target_link_libraries(armral PRIVATE ${ARMRAL_LINKER_FLAGS}) + +add_library(armral_utils ${ARMRAL_UTIL_SOURCES}) +target_include_directories(armral_utils PUBLIC ${ARMRAL_LIB_INC}) +target_compile_definitions(armral_utils PUBLIC ${ARMRAL_ARCH_TYPE}) +target_compile_options(armral_utils PRIVATE ${ARMRAL_ARCH_COMPILE_OPTIONS} + ${ARMRAL_COMPILER_FLAGS}) +target_link_libraries(armral_utils PRIVATE ${ARMRAL_LINKER_FLAGS}) + +target_link_libraries(armral PUBLIC hwy) +target_link_libraries(armral_utils PUBLIC hwy) + +if(ARMRAL_SEMIHOSTING) + # 
When semihosting we need to pass "-DARMRAL_SEMIHOSTING" as a compiler flag, + # so we specify the string "ARMRAL_SEMIHOSTING" rather than the CMake variable + # ARMRAL_SEMIHOSTING + target_compile_definitions(armral PUBLIC "ARMRAL_SEMIHOSTING") + target_compile_definitions(armral_utils PUBLIC "ARMRAL_SEMIHOSTING") +endif() + +include(GNUInstallDirs) +install(TARGETS armral DESTINATION ${CMAKE_INSTALL_LIBDIR}) +install( + DIRECTORY include/ + DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} + FILES_MATCHING + PATTERN "*.h") +install(FILES LICENSE.md THIRD_PARTY_LICENSES.md + DESTINATION ${CMAKE_INSTALL_DATADIR}/licenses/armral) + +if(BUILD_TESTING) + include(CTest) + + if(NOT DEFINED BENCHMARKER_SOURCE_DIR) + set(BENCHMARKER_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) + endif() + if(NOT DEFINED BENCHMARKER_BUILD_DIR) + set(BENCHMARKER_BUILD_DIR ${CMAKE_BINARY_DIR}) + endif() + if(NOT DEFINED BENCHMARKER_RUNNER) + set(BENCHMARKER_RUNNER "${BENCHMARKER_SOURCE_DIR}/bench/default_runner.py") + endif() + + add_custom_target( + check + COMMAND ${CMAKE_CTEST_COMMAND} + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + + add_custom_target( + bench + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py ${CMAKE_SOURCE_DIR} + ${BENCHMARKER_BUILD_DIR} --runner ${BENCHMARKER_RUNNER} + ${JOB_POOL_CONSOLE} + WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) + + add_custom_target( + bench_concurrent + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py ${CMAKE_SOURCE_DIR} + ${BENCHMARKER_BUILD_DIR} --runner ${BENCHMARKER_RUNNER} --concurrent + ${JOB_POOL_CONSOLE} + WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) + + add_custom_target( + bench_excel_summary + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py ${CMAKE_SOURCE_DIR} + ${BENCHMARKER_BUILD_DIR} --runner ${BENCHMARKER_RUNNER} | tee + ${BENCHMARKER_BUILD_DIR}/out.json + COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/python/benchmark_excel_summary.py + ${BENCHMARKER_BUILD_DIR}/out.json ${JOB_POOL_CONSOLE} + WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR}) + 
+ set(ARMRAL_TEST_LINK_LIBRARIES armral armral_utils) + + if(STATIC_TESTING) + set(ARMRAL_LINKER_FLAGS ${ARMRAL_LINKER_FLAGS} -static) + endif() + + # Utility function to add a test + function(add_armral_test TEST_NAME TEST_SOURCE) + # Build the actual test executable itself + add_executable(${TEST_NAME} ${TEST_SOURCE}) + target_link_libraries(${TEST_NAME} ${ARMRAL_TEST_LINK_LIBRARIES} + ${ARMRAL_LINKER_FLAGS}) + target_include_directories(${TEST_NAME} PRIVATE ${ARMRAL_TEST_INC}) + target_compile_options(${TEST_NAME} PRIVATE ${ARMRAL_COMPILER_FLAGS} + ${ARMRAL_ARCH_COMPILE_OPTIONS}) + + # Register it as a test, set up dependencies + add_test(NAME ${TEST_NAME} COMMAND ${ARMRAL_TEST_RUNNER} + ${CMAKE_CURRENT_BINARY_DIR}/${TEST_NAME}) + if(ARMRAL_ENABLE_ASAN) + # Avoid slow-downs in newer versions of Address Santizier + # https://github.com/llvm/llvm-project/issues/64190 + set_tests_properties( + ${TEST_NAME} PROPERTIES ENVIRONMENT + "ASAN_OPTIONS=detect_stack_use_after_return=0") + endif() + add_dependencies(check ${TEST_NAME}) + endfunction() + + # Utility function to add a benchmark + function(add_armral_bench BENCH_NAME BENCH_SOURCE) + + # Build the actual bench executable itself + add_executable(bench_${BENCH_NAME} ${BENCH_SOURCE}) + target_link_libraries(bench_${BENCH_NAME} ${ARMRAL_TEST_LINK_LIBRARIES} + ${ARMRAL_LINKER_FLAGS}) + target_include_directories(bench_${BENCH_NAME} PRIVATE ${ARMRAL_TEST_INC}) + target_compile_options(bench_${BENCH_NAME} PRIVATE ${ARMRAL_COMPILER_FLAGS}) + + # Register it as a benchmark, set up dependencies + add_dependencies(bench bench_${BENCH_NAME}) + add_dependencies(bench_concurrent bench_${BENCH_NAME}) + add_dependencies(bench_excel_summary bench_${BENCH_NAME}) + + # Add target for running the benchmark + get_filename_component(BENCH_DIR ${BENCH_SOURCE} DIRECTORY) + add_custom_target( + run_bench_${BENCH_NAME} + COMMAND + ${BENCHMARKER_SOURCE_DIR}/bench/benchmarker.py + ${CMAKE_CURRENT_SOURCE_DIR}/${BENCH_DIR} 
${BENCHMARKER_BUILD_DIR} + --runner ${BENCHMARKER_RUNNER} --concurrent ${JOB_POOL_CONSOLE} + WORKING_DIRECTORY ${BENCHMARKER_BUILD_DIR} + DEPENDS bench_${BENCH_NAME}) + endfunction() + + # cmake-format: off +# add_armral_test(matrix_inv_batch test/BasicMathFun/MatrixInv/Batch/main.cpp) +# add_armral_test(matrix_inv_single test/BasicMathFun/MatrixInv/Single/main.cpp) +# add_armral_test(arm_solve +# test/BasicMathFun/MatrixMult/Batch/ArmSolve/main.cpp) +# add_armral_test( +# matrix_vector_mult_batch_16 +# test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/main.cpp) +# add_armral_test( +# matrix_vector_mult_batch_32 +# test/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/main.cpp) +# add_armral_test(matrix_mult_16 +# test/BasicMathFun/MatrixMult/Single/MatrixMult16/main.cpp) +# add_armral_test(matrix_mult_32 +# test/BasicMathFun/MatrixMult/Single/MatrixMult32/main.cpp) +# add_armral_test(matrix_mult_aah_32 +# test/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) +# add_armral_test(matrix_mult_ahb_32 +# test/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) +# add_armral_test( +# matrix_vector_mult_single_16 +# test/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/main.cpp) +# add_armral_test( +# matrix_vector_mult_single_32 +# test/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) +# add_armral_test(matrix_pseudo_inv_direct +# test/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) +# add_armral_test(vec_dot_16 test/BasicMathFun/VectorDotProd/VecDot16/main.cpp) +# add_armral_test(vec_dot_16_2 +# test/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) +# add_armral_test(vec_dot_16_2_32_bit +# test/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) +# add_armral_test(vec_dot_16_32_bit +# test/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) +# add_armral_test(vec_dot_32 test/BasicMathFun/VectorDotProd/VecDot32/main.cpp) +# add_armral_test(vec_dot_32_2 +# test/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) +# add_armral_test(vec_mul_16 
test/BasicMathFun/VectorMult/VecMul16/main.cpp) +# add_armral_test(vec_mul_16_2 test/BasicMathFun/VectorMult/VecMul16_2/main.cpp) +# add_armral_test(vec_mul_32 test/BasicMathFun/VectorMult/VecMul32/main.cpp) +# add_armral_test(vec_mul_32_2 test/BasicMathFun/VectorMult/VecMul32_2/main.cpp) +# add_armral_test(mu_law_compression +# test/DuRuInterface/MuLaw/Compression/main.cpp) +# add_armral_test(mu_law_decompression +# test/DuRuInterface/MuLaw/Decompression/main.cpp) +# add_armral_test(block_float_compression +# test/DuRuInterface/ORanBlockFloat/Compression/main.cpp) +# add_armral_test(block_float_decompression +# test/DuRuInterface/ORanBlockFloat/Decompression/main.cpp) +# add_armral_test(block_scaling_compression +# test/DuRuInterface/ORanBlockScaling/Compression/main.cpp) +# add_armral_test(block_scaling_decompression +# test/DuRuInterface/ORanBlockScaling/Decompression/main.cpp) +# add_armral_test(correlation test/LowerPHY/Correlation/main.cpp) +# add_armral_test(fft_cs16 test/LowerPHY/FFT/FFT16/main.cpp) +# add_armral_test(fft_cf32 test/LowerPHY/FFT/FFT32/main.cpp) +# add_armral_test(arm_fir_filter_cs16 test/LowerPHY/FIR/FIR16/main.cpp) +# add_armral_test(arm_fir_filter_cs16_decimate_2 +# test/LowerPHY/FIR/FIR16Decimate2/main.cpp) +# add_armral_test(arm_fir_filter_cf32 test/LowerPHY/FIR/FIR32/main.cpp) +# add_armral_test(arm_fir_filter_cf32_decimate_2 +# test/LowerPHY/FIR/FIR32Decimate2/main.cpp) +# add_armral_test(scrambling test/LowerPHY/Scrambling/main.cpp) +# add_armral_test(seq_generator test/LowerPHY/SeqGenerator/main.cpp) + add_armral_test(crc test/UpperPHY/CRC/main.cpp) +# add_armral_test(tail_biting_convolutional_decoding +# test/UpperPHY/ConvolutionalDecoder/main.cpp) +# add_armral_test(tail_biting_convolutional_encoding +# test/UpperPHY/ConvolutionalEncoder/main.cpp) +# add_armral_test(demodulation test/UpperPHY/Demodulation/main.cpp) +# add_armral_test(ldpc_decoding test/UpperPHY/LDPC/Decoding/main.cpp) +# add_armral_test(ldpc_encoding 
test/UpperPHY/LDPC/Encoding/main.cpp) +# add_armral_test(ldpc_rate_matching test/UpperPHY/LDPC/RateMatching/main.cpp) +# add_armral_test(ldpc_rate_recovery test/UpperPHY/LDPC/RateRecovery/main.cpp) +# add_armral_test(modulation test/UpperPHY/Modulation/main.cpp) +# add_armral_test(polar_crc_attachment +# test/UpperPHY/Polar/CrcAttachment/main.cpp) +# add_armral_test(polar_decoder test/UpperPHY/Polar/Decoding/main.cpp) +# add_armral_test(polar_encoder test/UpperPHY/Polar/Encoding/main.cpp) +# add_armral_test(polar_frozen_mask test/UpperPHY/Polar/Frozen/main.cpp) +# add_armral_test(polar_rate_matching test/UpperPHY/Polar/RateMatching/main.cpp) +# add_armral_test(polar_rate_recovery test/UpperPHY/Polar/RateRecovery/main.cpp) +# add_armral_test(polar_subchannel_deinterleave +# test/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) +# add_armral_test(polar_subchannel_interleave +# test/UpperPHY/Polar/SubchannelInterleave/main.cpp) +# add_armral_test(turbo_decoding test/UpperPHY/Turbo/Decoding/main.cpp) +# add_armral_test(turbo_encoding test/UpperPHY/Turbo/Encoding/main.cpp) +# add_armral_test(turbo_perm_indices test/UpperPHY/Turbo/PermIndices/main.cpp) +# add_armral_test(turbo_rate_matching test/UpperPHY/Turbo/RateMatching/main.cpp) +# add_armral_test(turbo_rate_recovery test/UpperPHY/Turbo/RateRecovery/main.cpp) +# add_armral_test(svd test/MatrixFactorizations/SVD/main.cpp) +# +# add_armral_bench( +# matrix_inv_batch_general +# bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/NonPA/main.cpp) +# add_armral_bench(matrix_inv_batch_general_pa +# bench/BasicMathFun/MatrixInv/Batch/GeneralMatInv/PA/main.cpp) +# add_armral_bench( +# matrix_inv_batch_hermitian +# bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/NonPA/main.cpp) +# add_armral_bench( +# matrix_inv_batch_hermitian_pa +# bench/BasicMathFun/MatrixInv/Batch/HermitianMatInv/PA/main.cpp) +# add_armral_bench(matrix_inv_single_general +# bench/BasicMathFun/MatrixInv/Single/GeneralMatInv/main.cpp) +# 
add_armral_bench(matrix_inv_single_hermitian +# bench/BasicMathFun/MatrixInv/Single/HermitianMatInv/main.cpp) +# add_armral_bench(arm_solve_1x2 +# bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x2/main.cpp) +# add_armral_bench(arm_solve_1x4 +# bench/BasicMathFun/MatrixMult/Batch/ArmSolve/1x4/main.cpp) +# add_armral_bench(arm_solve_2x2 +# bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x2/main.cpp) +# add_armral_bench(arm_solve_2x4 +# bench/BasicMathFun/MatrixMult/Batch/ArmSolve/2x4/main.cpp) +# add_armral_bench(arm_solve_4x4 +# bench/BasicMathFun/MatrixMult/Batch/ArmSolve/4x4/main.cpp) +# add_armral_bench( +# matrix_vector_mult_batch_i16_32b +# bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/NonPA/main.cpp) +# add_armral_bench( +# matrix_vector_mult_batch_i16_32b_pa +# bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/32b/PA/main.cpp) +# add_armral_bench( +# matrix_vector_mult_batch_i16_64b +# bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/NonPA/main.cpp) +# add_armral_bench( +# matrix_vector_mult_batch_i16_64b_pa +# bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult16/64b/PA/main.cpp) +# add_armral_bench( +# matrix_vector_mult_batch_f32 +# bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/NonPA/main.cpp) +# add_armral_bench( +# matrix_vector_mult_batch_f32_pa +# bench/BasicMathFun/MatrixMult/Batch/MatrixVectorMult32/PA/main.cpp) +# add_armral_bench( +# matrix_mult_i16_32b +# bench/BasicMathFun/MatrixMult/Single/MatrixMult16/32b/main.cpp) +# add_armral_bench( +# matrix_mult_i16_64b +# bench/BasicMathFun/MatrixMult/Single/MatrixMult16/64b/main.cpp) +# add_armral_bench( +# matrix_mult_f32_2x2_iq +# bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/IQ/main.cpp) +# add_armral_bench( +# matrix_mult_f32_2x2 +# bench/BasicMathFun/MatrixMult/Single/MatrixMult32/2x2/NonIQ/main.cpp) +# add_armral_bench( +# matrix_mult_f32_4x4_iq +# bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/IQ/main.cpp) +# add_armral_bench( +# 
matrix_mult_f32_4x4 +# bench/BasicMathFun/MatrixMult/Single/MatrixMult32/4x4/NonIQ/main.cpp) +# add_armral_bench( +# matmul_f32_general +# bench/BasicMathFun/MatrixMult/Single/MatrixMult32/general/main.cpp) +# add_armral_bench( +# matrix_mult_aah_32 +# bench/BasicMathFun/MatrixMult/Single/MatrixMultAAH32/main.cpp) +# add_armral_bench( +# matrix_mult_ahb_32 +# bench/BasicMathFun/MatrixMult/Single/MatrixMultAHB32/main.cpp) +# add_armral_bench( +# matrix_vector_mult_i16_32b +# bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/32bit/main.cpp) +# add_armral_bench( +# matrix_vector_mult_i16_64b +# bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult16/64bit/main.cpp) +# add_armral_bench( +# matrix_vector_mult_32 +# bench/BasicMathFun/MatrixMult/Single/MatrixVectorMult32/main.cpp) +# add_armral_bench(matrix_pseudo_inv_direct +# bench/BasicMathFun/MatrixPseudoInv/Direct/main.cpp) +# add_armral_bench(vec_dot_16 +# bench/BasicMathFun/VectorDotProd/VecDot16/main.cpp) +# add_armral_bench(vec_dot_16_2 +# bench/BasicMathFun/VectorDotProd/VecDot16_2/main.cpp) +# add_armral_bench(vec_dot_16_2_32_bit +# bench/BasicMathFun/VectorDotProd/VecDot16_2_32bit/main.cpp) +# add_armral_bench(vec_dot_16_32_bit +# bench/BasicMathFun/VectorDotProd/VecDot16_32bit/main.cpp) +# add_armral_bench(vec_dot_32 +# bench/BasicMathFun/VectorDotProd/VecDot32/main.cpp) +# add_armral_bench(vec_dot_32_2 +# bench/BasicMathFun/VectorDotProd/VecDot32_2/main.cpp) +# add_armral_bench(vec_mul_16 bench/BasicMathFun/VectorMult/VecMul16/main.cpp) +# add_armral_bench(vec_mul_16_2 +# bench/BasicMathFun/VectorMult/VecMul16_2/main.cpp) +# add_armral_bench(vec_mul_32 bench/BasicMathFun/VectorMult/VecMul32/main.cpp) +# add_armral_bench(vec_mul_32_2 +# bench/BasicMathFun/VectorMult/VecMul32_2/main.cpp) +# add_armral_bench(mu_law_compression_14bit +# bench/DuRuInterface/MuLaw/Compression/14bit/main.cpp) +# add_armral_bench(mu_law_compression_8bit +# bench/DuRuInterface/MuLaw/Compression/8bit/main.cpp) +# 
add_armral_bench(mu_law_compression_9bit +# bench/DuRuInterface/MuLaw/Compression/9bit/main.cpp) +# add_armral_bench(mu_law_decompression_14bit +# bench/DuRuInterface/MuLaw/Decompression/14bit/main.cpp) +# add_armral_bench(mu_law_decompression_8bit +# bench/DuRuInterface/MuLaw/Decompression/8bit/main.cpp) +# add_armral_bench(mu_law_decompression_9bit +# bench/DuRuInterface/MuLaw/Decompression/9bit/main.cpp) +# add_armral_bench( +# block_float_compression_12bit +# bench/DuRuInterface/ORanBlockFloat/Compression/12bit/main.cpp) +# add_armral_bench( +# block_float_compression_14bit +# bench/DuRuInterface/ORanBlockFloat/Compression/14bit/main.cpp) +# add_armral_bench(block_float_compression_8bit +# bench/DuRuInterface/ORanBlockFloat/Compression/8bit/main.cpp) +# add_armral_bench(block_float_compression_9bit +# bench/DuRuInterface/ORanBlockFloat/Compression/9bit/main.cpp) +# add_armral_bench( +# block_float_decompression_12bit +# bench/DuRuInterface/ORanBlockFloat/Decompression/12bit/main.cpp) +# add_armral_bench( +# block_float_decompression_14bit +# bench/DuRuInterface/ORanBlockFloat/Decompression/14bit/main.cpp) +# add_armral_bench( +# block_float_decompression_8bit +# bench/DuRuInterface/ORanBlockFloat/Decompression/8bit/main.cpp) +# add_armral_bench( +# block_float_decompression_9bit +# bench/DuRuInterface/ORanBlockFloat/Decompression/9bit/main.cpp) +# add_armral_bench( +# block_scaling_compression_14bit +# bench/DuRuInterface/ORanBlockScaling/Compression/14bit/main.cpp) +# add_armral_bench( +# block_scaling_compression_8bit +# bench/DuRuInterface/ORanBlockScaling/Compression/8bit/main.cpp) +# add_armral_bench( +# block_scaling_compression_9bit +# bench/DuRuInterface/ORanBlockScaling/Compression/9bit/main.cpp) +# add_armral_bench( +# block_scaling_decompression_14bit +# bench/DuRuInterface/ORanBlockScaling/Decompression/14bit/main.cpp) +# add_armral_bench( +# block_scaling_decompression_8bit +# bench/DuRuInterface/ORanBlockScaling/Decompression/8bit/main.cpp) +# 
add_armral_bench( +# block_scaling_decompression_9bit +# bench/DuRuInterface/ORanBlockScaling/Decompression/9bit/main.cpp) +# add_armral_bench(correlation bench/LowerPHY/Correlation/main.cpp) +# add_armral_bench(fft_cs16 bench/LowerPHY/FFT/FFT16/main.cpp) +# add_armral_bench(fft_cf32 bench/LowerPHY/FFT/FFT32/main.cpp) +# add_armral_bench(arm_fir_filter_cs16 bench/LowerPHY/FIR/FIR16/main.cpp) +# add_armral_bench(arm_fir_filter_cs16_decimate_2 +# bench/LowerPHY/FIR/FIR16Decimate2/main.cpp) +# add_armral_bench(arm_fir_filter_cf32 bench/LowerPHY/FIR/FIR32/main.cpp) +# add_armral_bench(arm_fir_filter_cf32_decimate_2 +# bench/LowerPHY/FIR/FIR32Decimate2/main.cpp) +# add_armral_bench(scrambling bench/LowerPHY/Scrambling/main.cpp) +# add_armral_bench(seq_generator bench/LowerPHY/SeqGenerator/main.cpp) + add_armral_bench(crc_11_be bench/UpperPHY/CRC/11/BigEndian/main.cpp) + add_armral_bench(crc_11_le bench/UpperPHY/CRC/11/LittleEndian/main.cpp) + add_armral_bench(crc_16_be bench/UpperPHY/CRC/16/BigEndian/main.cpp) + add_armral_bench(crc_16_le bench/UpperPHY/CRC/16/LittleEndian/main.cpp) + add_armral_bench(crc_24a_be bench/UpperPHY/CRC/24/A/BigEndian/main.cpp) + add_armral_bench(crc_24a_le bench/UpperPHY/CRC/24/A/LittleEndian/main.cpp) + add_armral_bench(crc_24b_be bench/UpperPHY/CRC/24/B/BigEndian/main.cpp) + add_armral_bench(crc_24b_le bench/UpperPHY/CRC/24/B/LittleEndian/main.cpp) + add_armral_bench(crc_24c_be bench/UpperPHY/CRC/24/C/BigEndian/main.cpp) + add_armral_bench(crc_24c_le bench/UpperPHY/CRC/24/C/LittleEndian/main.cpp) + add_armral_bench(crc_6_be bench/UpperPHY/CRC/6/BigEndian/main.cpp) + add_armral_bench(crc_6_le bench/UpperPHY/CRC/6/LittleEndian/main.cpp) +# add_armral_bench(tail_biting_convolutional_decoding +# bench/UpperPHY/ConvolutionalDecoder/main.cpp) +# add_armral_bench(tail_biting_convolutional_encoding +# bench/UpperPHY/ConvolutionalEncoder/main.cpp) +# add_armral_bench(demodulation bench/UpperPHY/Demodulation/main.cpp) +# 
add_armral_bench(ldpc_decoding bench/UpperPHY/LDPC/Decoding/main.cpp) +# add_armral_bench(ldpc_encoding bench/UpperPHY/LDPC/Encoding/main.cpp) +# add_armral_bench(ldpc_rate_matching bench/UpperPHY/LDPC/RateMatching/main.cpp) +# add_armral_bench(ldpc_rate_recovery bench/UpperPHY/LDPC/RateRecovery/main.cpp) +# add_armral_bench(modulation bench/UpperPHY/Modulation/main.cpp) +# add_armral_bench(polar_decoder bench/UpperPHY/Polar/Decoding/main.cpp) +# add_armral_bench(polar_encoder bench/UpperPHY/Polar/Encoding/main.cpp) +# add_armral_bench(polar_frozen_mask bench/UpperPHY/Polar/Frozen/main.cpp) +# add_armral_bench(polar_rate_matching +# bench/UpperPHY/Polar/RateMatching/main.cpp) +# add_armral_bench(polar_rate_recovery +# bench/UpperPHY/Polar/RateRecovery/main.cpp) +# add_armral_bench(polar_subchannel_deinterleave +# bench/UpperPHY/Polar/SubchannelDeinterleave/main.cpp) +# add_armral_bench(polar_subchannel_interleave +# bench/UpperPHY/Polar/SubchannelInterleave/main.cpp) +# add_armral_bench(turbo_decoding bench/UpperPHY/Turbo/Decoding/main.cpp) +# add_armral_bench(turbo_encoding bench/UpperPHY/Turbo/Encoding/main.cpp) +# add_armral_bench(turbo_rate_matching +# bench/UpperPHY/Turbo/RateMatching/main.cpp) +# add_armral_bench(turbo_rate_recovery +# bench/UpperPHY/Turbo/RateRecovery/main.cpp) +# add_armral_bench(svd bench/MatrixFactorizations/SVD/main.cpp) +# cmake-format: on +endif() + +if(BUILD_EXAMPLES) + add_custom_target(make_examples_dir ALL COMMAND ${CMAKE_COMMAND} -E + make_directory examples) + add_custom_target(examples) + add_custom_target(run_examples) + add_dependencies(run_examples examples) + + # Any parameters after the first one will be passed as parameters to the + # example executable when running it + function(add_armral_example EXAMPLE_SOURCE) + get_filename_component(EXAMPLE_EXE ${EXAMPLE_SOURCE} NAME_WE) + add_executable(${EXAMPLE_EXE} ${EXAMPLE_SOURCE}) + add_dependencies(${EXAMPLE_EXE} make_examples_dir) + set(EXAMPLE_OUTPUT_NAME 
examples/${EXAMPLE_EXE}) + set_target_properties(${EXAMPLE_EXE} PROPERTIES OUTPUT_NAME + ${EXAMPLE_OUTPUT_NAME}) + + target_link_libraries(${EXAMPLE_EXE} armral m) + + add_custom_target( + run_${EXAMPLE_EXE} + COMMAND ${EXAMPLE_OUTPUT_NAME} ${ARGN} + DEPENDS ${EXAMPLE_EXE}) + add_dependencies(examples ${EXAMPLE_EXE}) + add_dependencies(run_examples run_${EXAMPLE_EXE}) + endfunction() + + # cmake-format: off +# add_armral_example(examples/block_float_9b_example.c) +# add_armral_example(examples/fft_cf32_example.c 10) +# add_armral_example(examples/modulation_example.c) +# add_armral_example(examples/polar_example.cpp 128 100 35) +# cmake-format: on +endif() + +# if(BUILD_SIMULATION) # Include simulation rules and targets This involves +# building dependencies # like AWGN library and OpenMP +# add_subdirectory(simulation) endif() + +find_package(Doxygen) +if(DOXYGEN_FOUND) + set(DOXYGEN_IN ${CMAKE_CURRENT_SOURCE_DIR}/Doxyfile.in) + set(DOXYGEN_OUT ${CMAKE_CURRENT_BINARY_DIR}/Doxyfile) + configure_file(${DOXYGEN_IN} ${DOXYGEN_OUT} @ONLY) + add_custom_target(docs COMMAND ${DOXYGEN_EXECUTABLE} ${DOXYGEN_OUT}) + install( + DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/docs/html + DESTINATION ${CMAKE_INSTALL_DOCDIR} + OPTIONAL) +endif() + +# Create target to uninstall the library +if(NOT TARGET uninstall) + configure_file( + "${CMAKE_CURRENT_SOURCE_DIR}/cmake_uninstall.cmake.in" + "${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake" IMMEDIATE @ONLY) + + add_custom_target( + uninstall COMMAND ${CMAKE_COMMAND} -P + ${CMAKE_CURRENT_BINARY_DIR}/cmake_uninstall.cmake) +endif() + +# Check that the C and C++ compilers are from the same toolchain +if(NOT CMAKE_C_COMPILER_ID STREQUAL CMAKE_CXX_COMPILER_ID) + message( + FATAL_ERROR + "CXX and C compiler providers differ. Please specify the same compiler toolchain" + ) +endif() + +set(COMP_ERR_MSG + "Compilation is only supported with GNU versions 7, 8, 9, 10, 11, 12, 13, 14. 
\ + If compilation fails please use one of the supported compilers." +) +if(CMAKE_C_COMPILER_ID STREQUAL "GNU") + if(CMAKE_C_COMPILER_VERSION VERSION_LESS 7.1 OR CMAKE_C_COMPILER_VERSION + VERSION_GREATER 14.2) + message(WARNING ${COMP_ERR_MSG}) + endif() +else() + message(WARNING ${COMP_ERR_MSG}) +endif() diff --git a/highway b/highway new file mode 160000 index 0000000000000000000000000000000000000000..54731f560d036db8e50b96eefad258bd0b35d50c --- /dev/null +++ b/highway @@ -0,0 +1 @@ +Subproject commit 54731f560d036db8e50b96eefad258bd0b35d50c diff --git a/include/armral.h b/include/armral.h index c495fdec8a169ce30d2a7b2fa9da04a7533a8828..9a3b058dc234c9907ed29fc5d930688616a25e39 100644 --- a/include/armral.h +++ b/include/armral.h @@ -77,6 +77,7 @@ * formats. */ +#ifndef ARMRAL_ARCH_HWY // GCC sometimes complains about use of uninitialized values in arm_neon.h. // nothing we can do about that, so ignore it! #ifndef __clang__ @@ -90,6 +91,27 @@ // Restore original warning flags. #ifndef __clang__ #pragma GCC diagnostic pop +#endif +#else +// GCC sometimes complains about declaration shadowing members in arm_neon-inl.h. +// nothing we can do about that, so ignore it! +#ifndef __clang__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wshadow" +#endif + +#include "hwy/highway.h" + +// Restore original warning flags. 
+#ifndef __clang__ +#pragma GCC diagnostic pop +#endif + +#if !HWY_ARCH_ARM +using float32_t = float; +using float64_t = double; +#endif + #endif #include diff --git a/src/UpperPHY/CRC/crc_common.hpp b/src/UpperPHY/CRC/crc_common.hpp index 47bf69e47039d1a596c204e2e02b50abba1662a1..e97b46678d6288135c9c597f887d03fe7193106f 100644 --- a/src/UpperPHY/CRC/crc_common.hpp +++ b/src/UpperPHY/CRC/crc_common.hpp @@ -2,8 +2,13 @@ Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ + #pragma once +#ifdef ARMRAL_ARCH_HWY +#include "highway/crc_common.hpp" +#else + #include static inline poly128_t vmull_force_low_p64(poly64x2_t a, poly64x2_t b) { @@ -269,3 +274,5 @@ crc64(uint32_t size, const uint64_t *input, uint64_t *crc, v0x = add_p64x2(v0x, v01); *crc = (uint64_t)(v0x[0]); } + +#endif \ No newline at end of file diff --git a/src/UpperPHY/CRC/highway/crc_common.hpp b/src/UpperPHY/CRC/highway/crc_common.hpp new file mode 100644 index 0000000000000000000000000000000000000000..11c2741a9cd8f1209f3fc3addd985622c5461eaf --- /dev/null +++ b/src/UpperPHY/CRC/highway/crc_common.hpp @@ -0,0 +1,254 @@ +/* + Arm RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates +*/ +#pragma once + +#include "utils/hwy_types.hpp" +#include + +namespace hn = hwy::HWY_NAMESPACE; + +// Allow compilation on non-arm architectures by aliasing poly64_t to an existing type +// Test if arm_neon.h has been included +#ifndef _AARCH64_NEON_H_ +using poly64_t = uint64_t; +#endif + +// Operation assumptions +// vmull_p64 = CLMulLower +// vmull_high_p64 = CLMulUpper +// vaddq_p64 = Xor +// vrev64q_u8 = Reverse8 +// vld1q_p64 = LoadU +// vld1q_dup_p64 = Load w/ single uint64, replaced by Set w/ dereferenced pointer + +template +static inline Vec_u64x2 load_p64x2(const uint64_t *p_in) { + Vec_u64x2 vec = hn::LoadU(du64x2, p_in); + if (Endianness == 'b') { + vec = hn::BitCast(du64x2, hn::Reverse8(du8x16, 
hn::BitCast(du8x16, vec))); + } + return vec; +} + +template +static inline Vec_u64x2 load_dup_p64(const uint64_t *p_in) { + Vec_u64x2 vec = hn::Set(du64x2, *p_in); + if (Endianness == 'b') { + vec = hn::BitCast(du64x2, hn::Reverse8(du8x16, hn::BitCast(du8x16, vec))); + } + return vec; +} + +/** + * Computes a CRC64 in big- or little-endian mode using the specified shifts + * and polynomials. This can be used for smaller polynomials by shifting + * them to a degree 64 polynomial. + * + * @tparam BarretShift the shift used when computing @c ls1_divp. + * @param[in] size number of bytes of the given buffer + * @param[in] input points to the input byte sequence + * @param[out] crc the computed CRC + * @param[in] constants the constants specific to each polynomial: + constants[0] = padding + constants[1] = (1<<128) / P_CRC - (1<<64) + constants[2:11] = [ (1<<(64*k)) mod P_CRC, + for k in [1,1,2,3,4,5,6,7,8,9] ] + */ +template +HWY_ATTR static inline __attribute__((always_inline)) void +crc64(uint32_t size, const uint64_t *input, uint64_t *crc, + const poly64_t constants[]) { + const uint64_t *p_in = input; + const uint64_t *constants_u64 = (const uint64_t *)constants; + + if (size == 8) { + // Special case for <=64 bits + Vec_u64x2 divp_p = hn::LoadU(du64x2, &constants_u64[1]); + + Vec_u64x2 v11 = load_dup_p64(p_in); + + // Barret reduction + Vec_u64x2 vb = hn::CLMulLower(v11, divp_p); + vb = hn::Xor(vb, v11); + Vec_u64x2 v0x = hn::CLMulUpper(vb, divp_p); + *crc = hn::GetLane(v0x); + return; + } + + // Load constants for size = 16 + Vec_u64x2 lsamodp_divp = hn::LoadU(du64x2, &constants_u64[0]); + Vec_u64x2 ls11modp = hn::LoadU(du64x2, &constants_u64[2]); + Vec_u64x2 ls23modp = hn::LoadU(du64x2, &constants_u64[4]); + + if (size == 16) { + Vec_u64x2 v21 = load_p64x2(p_in); + Vec_u64x2 v01 = hn::CLMulLower(v21, ls23modp); + Vec_u64x2 vx1 = hn::Xor(v01, v21); + + // Barret reduction + Vec_u64x2 vb = hn::CLMulUpper(vx1, lsamodp_divp); + vb = hn::Xor(vb, vx1); + Vec_u64x2 
v0x = hn::CLMulUpper(vb, ls11modp); + v0x = hn::Xor(v0x, v01); + *crc = hn::GetLane(v0x); + return; + } + + // Load the rest of the constants + Vec_u64x2 ls45modp = hn::LoadU(du64x2, &constants_u64[6]); + Vec_u64x2 ls67modp = hn::LoadU(du64x2, &constants_u64[8]); + Vec_u64x2 ls89modp = hn::LoadU(du64x2, &constants_u64[10]); + + if (size == 32) { + Vec_u64x2 v43a = load_p64x2(p_in); + Vec_u64x2 v19 = load_p64x2(p_in + 2); + Vec_u64x2 v01e = hn::CLMulLower(v43a, ls45modp); + Vec_u64x2 v01a = hn::CLMulUpper(v43a, ls23modp); + Vec_u64x2 v01 = hn::Xor(v01a, v01e); + v01a = hn::CLMulLower(v19, ls23modp); + v01 = hn::Xor(v01, v01a); + Vec_u64x2 vx1 = hn::Xor(v01, v19); + + // Barret reduction + Vec_u64x2 vb = hn::CLMulUpper(vx1, lsamodp_divp); + vb = hn::Xor(vb, vx1); + Vec_u64x2 v0x = hn::CLMulUpper(vb, ls11modp); + v0x = hn::Xor(v0x, v01); + *crc = hn::GetLane(v0x); + return; + } + + // remainder of the division by 64 byte == 512 bit, i.e. 4 vectors of 128 bit + uint32_t init_bytes = size % 64; + const uint64_t *p_end = p_in + (size - 16) / 8; + + // These values are carried forwards to the next loop iteration each time. 
+ Vec_u64x2 v01; + + if (init_bytes == 16) { + v01 = hn::Zero(du64x2); + p_in += 8; + } else if (init_bytes == 32) { + Vec_u64x2 v43 = load_p64x2(p_in); + p_in += 10; + Vec_u64x2 v01e = hn::CLMulLower(v43, ls45modp); + Vec_u64x2 v01a = hn::CLMulUpper(v43, ls23modp); + v01 = hn::Xor(v01a, v01e); + } else if (init_bytes == 48) { + Vec_u64x2 v65 = load_p64x2(p_in); + Vec_u64x2 v43 = load_p64x2(p_in + 2); + p_in += 12; + Vec_u64x2 v01g = hn::CLMulLower(v65, ls67modp); + Vec_u64x2 v01e = hn::CLMulUpper(v65, ls45modp); + Vec_u64x2 v01c = hn::CLMulLower(v43, ls45modp); + Vec_u64x2 v01a = hn::CLMulUpper(v43, ls23modp); + v01e = hn::Xor(v01e, v01g); + v01a = hn::Xor(v01a, v01c); + v01 = hn::Xor(v01a, v01e); + + } else { + Vec_u64x2 v87 = load_p64x2(p_in); + Vec_u64x2 v65 = load_p64x2(p_in + 2); + Vec_u64x2 v43 = load_p64x2(p_in + 4); + p_in += 14; + Vec_u64x2 v01d = hn::CLMulLower(v87, ls89modp); + Vec_u64x2 v01c = hn::CLMulUpper(v87, ls67modp); + Vec_u64x2 v01b = hn::CLMulLower(v65, ls67modp); + Vec_u64x2 v01a = hn::CLMulUpper(v65, ls45modp); + Vec_u64x2 v01g = hn::CLMulLower(v43, ls45modp); + Vec_u64x2 v01e = hn::CLMulUpper(v43, ls23modp); + v01c = hn::Xor(v01c, v01d); + v01a = hn::Xor(v01a, v01b); + v01e = hn::Xor(v01e, v01g); + v01a = hn::Xor(v01a, v01c); + v01 = hn::Xor(v01a, v01e); + } + + Vec_u64x2 v19 = load_p64x2(p_in - 8); + + if (size <= 64) { + Vec_u64x2 v01a = hn::CLMulLower(v19, ls23modp); + v01 = hn::Xor(v01, v01a); + Vec_u64x2 vx1 = hn::Xor(v01, v19); + + // Barret reduction + Vec_u64x2 vb = hn::CLMulUpper(vx1, lsamodp_divp); + vb = hn::Xor(vb, vx1); + Vec_u64x2 v0x = hn::CLMulUpper(vb, ls11modp); + v0x = hn::Xor(v0x, v01); + *crc = hn::GetLane(v0x); + return; + } + + Vec_u64x2 v87 = load_p64x2(p_in - 6); + Vec_u64x2 v65 = load_p64x2(p_in - 4); + Vec_u64x2 v43 = load_p64x2(p_in - 2); + + while (p_in < p_end) { + Vec_u64x2 v01bb = hn::CLMulLower(v19, lsamodp_divp); + Vec_u64x2 v01b = hn::CLMulUpper(v87, ls67modp); + Vec_u64x2 vx9 = hn::Xor(v01, v19); + 
Vec_u64x2 v8x = hn::Xor(v87, v01); + + v19 = load_p64x2(p_in); + v87 = load_p64x2(p_in + 2); + + Vec_u64x2 v01g = hn::CLMulUpper(vx9, ls89modp); + Vec_u64x2 v01e = hn::CLMulLower(v8x, ls89modp); + + v01b = hn::Xor(v01b, v01bb); + + Vec_u64x2 v01aa = hn::CLMulLower(v65, ls67modp); + Vec_u64x2 v01a = hn::CLMulUpper(v65, ls45modp); + Vec_u64x2 v01d = hn::CLMulLower(v43, ls45modp); + Vec_u64x2 v01c = hn::CLMulUpper(v43, ls23modp); + + v65 = load_p64x2(p_in + 4); + v43 = load_p64x2(p_in + 6); + p_in += 8; + + v01a = hn::Xor(v01a, v01aa); + v01c = hn::Xor(v01c, v01d); + v01a = hn::Xor(v01a, v01b); + v01e = hn::Xor(v01e, v01g); + v01a = hn::Xor(v01a, v01c); + v01 = hn::Xor(v01a, v01e); + } + + Vec_u64x2 v21 = load_p64x2(p_in); + + Vec_u64x2 v01ff = hn::CLMulLower(v19, lsamodp_divp); + Vec_u64x2 v01f = hn::CLMulUpper(v87, ls67modp); + Vec_u64x2 vx9 = hn::Xor(v01, v19); + Vec_u64x2 v8x = hn::Xor(v87, v01); + + Vec_u64x2 v01ee = hn::CLMulUpper(vx9, ls89modp); + Vec_u64x2 v01e = hn::CLMulLower(v8x, ls89modp); + + v01f = hn::Xor(v01f, v01ff); + v01e = hn::Xor(v01e, v01ee); + v01e = hn::Xor(v01e, v01f); + + Vec_u64x2 v01d = hn::CLMulLower(v65, ls67modp); + Vec_u64x2 v01c = hn::CLMulUpper(v65, ls45modp); + Vec_u64x2 v01b = hn::CLMulLower(v43, ls45modp); + Vec_u64x2 v01a = hn::CLMulUpper(v43, ls23modp); + Vec_u64x2 v01g = hn::CLMulLower(v21, ls23modp); + + v01c = hn::Xor(v01c, v01d); + v01a = hn::Xor(v01a, v01b); + v01e = hn::Xor(v01e, v01g); + v01a = hn::Xor(v01a, v01c); + v01 = hn::Xor(v01a, v01e); + + Vec_u64x2 vx1 = hn::Xor(v01, v21); + + // Barret reduction + Vec_u64x2 vb = hn::CLMulUpper(vx1, lsamodp_divp); + vb = hn::Xor(vb, vx1); + Vec_u64x2 v0x = hn::CLMulUpper(vb, ls11modp); + v0x = hn::Xor(v0x, v01); + *crc = hn::GetLane(v0x); +} diff --git a/src/utils/hwy_types.hpp b/src/utils/hwy_types.hpp new file mode 100644 index 0000000000000000000000000000000000000000..ddbe35d15180848871df120996f9757394ea5e03 --- /dev/null +++ b/src/utils/hwy_types.hpp @@ -0,0 +1,88 @@ +/* + Arm 
RAN Acceleration Library + SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates + Copyright (C) COPYRIGHT Cambridge Consultants Ltd 2023-2025 + Cambridge Consultants Project Reference P5851 +*/ + +/* +Defines base vector types for developing using Google Highway for +vector instructions. If there is a common vector type which is used +across files include it here. Otherwise, for specific vector types, +such as a hn::Rebind should be included locally in source files. +*/ + +#pragma once + +#include +namespace hn = hwy::HWY_NAMESPACE; + +/* +Full128 Vector Types. In general, use these where there is cross-lane +logic is used (e.g. a pairwise add) or where data-chunking is heavily +tied to 128-bit blocks. +*/ + +// Vector Tags +const hn::Full128 du8x16; +const hn::Full128 di8x16; +const hn::Full128 du16x8; +const hn::Full128 di16x8; +const hn::Full128 du32x4; +const hn::Full128 di32x4; +const hn::Full128 du64x2; +const hn::Full128 di64x2; + +// Vector Types +using Vec_u8x16 = hn::Vec; +using Vec_i8x16 = hn::Vec; +using Vec_u16x8 = hn::Vec; +using Vec_i16x8 = hn::Vec; +using Vec_u32x4 = hn::Vec; +using Vec_i32x4 = hn::Vec; +using Vec_u64x2 = hn::Vec; +using Vec_i64x2 = hn::Vec; + +// Rebind Tags +/* e.g. const hn::Rebind di16x8_di8x16; +where the first tag named in the rebind tag is the old type +which the rebind tag is created from and the second is the +new tag type. These are used in operations where output vector +width is different from that of the input. */ + +/* +Scalable vector types. The default choice should be to use +these vector types since it allows for processing of more +data for wider vector widths. Use Full128 for the reasons +listed above. + +Note lack of quantity of vector elements - this is variable. +Use hn::Lanes(vector_tag) to stride by the correct size when +looping over data. 
+*/ + +// Vector Tags +const hn::ScalableTag du8; +const hn::ScalableTag di8; +const hn::ScalableTag du16; +const hn::ScalableTag di16; +const hn::ScalableTag du32; +const hn::ScalableTag di32; +const hn::ScalableTag du64; +const hn::ScalableTag di64; + +// Vector Types +using Vec_u8 = hn::Vec; +using Vec_i8 = hn::Vec; +using Vec_u16 = hn::Vec; +using Vec_i16 = hn::Vec; +using Vec_u32 = hn::Vec; +using Vec_i32 = hn::Vec; +using Vec_u64 = hn::Vec; +using Vec_i64 = hn::Vec; + +// Rebind Tags +/* e.g. const hn::Rebind di16_di8; +where the first tag named in the rebind tag is the old type +which the rebind tag is created from and the second is the +new tag type. */ \ No newline at end of file diff --git a/utils/rng.cpp b/utils/rng.cpp index 33887ee326941cf279626148157881ba33c971c1..2385df7d2f72bb662009f0768184a27607ed3f8a 100644 --- a/utils/rng.cpp +++ b/utils/rng.cpp @@ -2,7 +2,12 @@ Arm RAN Acceleration Library SPDX-FileCopyrightText: Copyright 2020-2024 Arm Limited and/or its affiliates */ +#if defined(ARMRAL_ARCH_HWY) && !HWY_ARCH_ARM +using float32_t = float; +using float64_t = double; +#else #include +#endif #include "rng.hpp"