From c577f2c6a3b4ddb6ba87a882723c53a248afbeba Mon Sep 17 00:00:00 2001 From: telsoa01 Date: Fri, 31 Aug 2018 09:22:23 +0100 Subject: [PATCH] Release 18.08 --- Android.mk | 86 +- BuildGuideAndroidNDK.md | 14 +- BuildGuideCrossCompilation.md | 265 ++ CMakeLists.txt | 218 +- README.md | 12 +- cmake/GlobalConfig.cmake | 89 +- include/armnn/ArmNN.hpp | 1 + include/armnn/Descriptors.hpp | 32 +- include/armnn/DescriptorsFwd.hpp | 1 + include/armnn/Exceptions.hpp | 37 +- include/armnn/INetwork.hpp | 217 +- include/armnn/IProfiler.hpp | 38 + include/armnn/IRuntime.hpp | 77 +- include/armnn/LayerSupport.hpp | 45 +- include/armnn/LstmParams.hpp | 55 + include/armnn/NetworkFwd.hpp | 3 +- include/armnn/Tensor.hpp | 22 +- include/armnn/Types.hpp | 46 +- include/armnn/TypesUtils.hpp | 133 +- include/armnn/Utils.hpp | 5 +- include/armnn/Version.hpp | 2 +- include/armnnCaffeParser/ICaffeParser.hpp | 10 +- include/armnnOnnxParser/IOnnxParser.hpp | 48 + include/armnnTfLiteParser/ITfLiteParser.hpp | 61 + include/armnnTfParser/ITfParser.hpp | 12 +- samples/CMakeLists.txt | 4 + samples/SimpleSample.cpp | 68 + src/armnn/Descriptors.cpp | 2 +- src/armnn/DeviceSpec.hpp | 22 + src/armnn/Graph.cpp | 111 +- src/armnn/Graph.hpp | 87 +- src/armnn/Half.hpp | 35 + src/armnn/IGraphObservable.hpp | 28 + src/armnn/Instrument.hpp | 66 + src/armnn/InternalTypes.cpp | 3 + src/armnn/InternalTypes.hpp | 5 +- src/armnn/JsonPrinter.cpp | 134 + src/armnn/JsonPrinter.hpp | 82 + src/armnn/Layer.cpp | 88 +- src/armnn/Layer.hpp | 70 +- src/armnn/LayerSupport.cpp | 88 +- src/armnn/LayerSupportCommon.hpp | 59 +- src/armnn/LayersFwd.hpp | 6 + src/armnn/LoadedNetwork.cpp | 91 +- src/armnn/LoadedNetwork.hpp | 11 +- src/armnn/NeonInterceptorScheduler.cpp | 57 + src/armnn/NeonInterceptorScheduler.hpp | 37 + src/armnn/NeonTimer.cpp | 56 + src/armnn/NeonTimer.hpp | 43 + src/armnn/Network.cpp | 339 +- src/armnn/Network.hpp | 7 +- src/armnn/NetworkUtils.hpp | 79 + src/armnn/Observable.cpp | 36 + src/armnn/Observable.hpp | 67 + src/armnn/OpenClTimer.cpp | 105 + src/armnn/OpenClTimer.hpp | 59 + src/armnn/Optimizer.cpp | 49 +- src/armnn/Optimizer.hpp | 33 +- src/armnn/Profiling.cpp | 455 ++- src/armnn/Profiling.hpp | 179 +- src/armnn/ProfilingEvent.cpp | 103 + src/armnn/ProfilingEvent.hpp | 92 + src/armnn/Runtime.cpp | 79 +- src/armnn/Runtime.hpp | 32 +- src/armnn/Tensor.cpp | 2 +- src/armnn/TypeUtils.hpp | 40 + src/armnn/Utils.cpp | 4 +- src/armnn/WallClockTimer.cpp | 41 + src/armnn/WallClockTimer.hpp | 63 + src/armnn/backends/AclBaseMemoryManager.cpp | 32 - src/armnn/backends/AclBaseMemoryManager.hpp | 46 - src/armnn/backends/ArmComputeTensorUtils.cpp | 29 +- src/armnn/backends/ArmComputeTensorUtils.hpp | 97 +- src/armnn/backends/ArmComputeUtils.hpp | 12 +- src/armnn/backends/ClContextControl.cpp | 61 +- src/armnn/backends/ClContextControl.hpp | 14 +- src/armnn/backends/ClLayerSupport.cpp | 222 +- src/armnn/backends/ClLayerSupport.hpp | 39 +- src/armnn/backends/ClTensorHandle.hpp | 84 +- src/armnn/backends/ClWorkloadFactory.cpp | 110 +- src/armnn/backends/ClWorkloadFactory.hpp | 29 +- src/armnn/backends/ClWorkloadUtils.hpp | 30 +- src/armnn/backends/ClWorkloads.hpp | 6 +- .../ClActivationFloat32Workload.cpp | 25 +- .../ClActivationFloat32Workload.hpp | 7 +- .../ClWorkloads/ClActivationUint8Workload.cpp | 14 +- .../ClWorkloads/ClActivationUint8Workload.hpp | 2 +- .../ClWorkloads/ClAdditionBaseWorkload.cpp | 71 + .../ClWorkloads/ClAdditionBaseWorkload.hpp | 29 + .../ClWorkloads/ClAdditionFloat32Workload.cpp | 41 +- .../ClWorkloads/ClAdditionFloat32Workload.hpp | 18 +- .../ClWorkloads/ClAdditionUint8Workload.cpp | 18 + .../ClWorkloads/ClAdditionUint8Workload.hpp | 20 + .../ClWorkloads/ClBaseConstantWorkload.cpp | 20 +- .../ClWorkloads/ClBaseConstantWorkload.hpp | 6 +- .../ClWorkloads/ClBaseMergerWorkload.hpp | 10 +- .../ClWorkloads/ClBaseSplitterWorkload.hpp | 10 +- .../ClBatchNormalizationFloat32Workload.cpp | 74 +- .../ClBatchNormalizationFloat32Workload.hpp | 22 +- .../ClWorkloads/ClConstantFloat32Workload.cpp | 2 +- .../ClWorkloads/ClConstantFloat32Workload.hpp | 4 +- .../ClWorkloads/ClConstantUint8Workload.cpp | 2 +- .../ClConvertFp16ToFp32Workload.cpp | 64 + .../ClConvertFp16ToFp32Workload.hpp | 28 + .../ClConvertFp32ToFp16Workload.cpp | 64 + .../ClConvertFp32ToFp16Workload.hpp | 28 + .../ClConvolution2dFloat32Workload.cpp | 36 +- .../ClConvolution2dFloat32Workload.hpp | 10 +- .../ClConvolution2dUint8Workload.cpp | 33 +- .../ClConvolution2dUint8Workload.hpp | 8 +- .../ClDepthwiseConvolutionBaseWorkload.cpp | 122 + .../ClDepthwiseConvolutionBaseWorkload.hpp | 37 + .../ClDepthwiseConvolutionFloat32Workload.cpp | 22 +- .../ClDepthwiseConvolutionFloat32Workload.hpp | 17 +- .../ClDepthwiseConvolutionHelper.hpp | 91 - .../ClDepthwiseConvolutionUint8Workload.cpp | 22 +- .../ClDepthwiseConvolutionUint8Workload.hpp | 16 +- .../ClWorkloads/ClFloorFloat32Workload.cpp | 4 +- .../ClWorkloads/ClFloorFloat32Workload.hpp | 2 +- .../ClFullyConnectedFloat32Workload.cpp | 70 +- .../ClFullyConnectedFloat32Workload.hpp | 19 +- .../ClL2NormalizationFloat32Workload.cpp | 16 +- .../ClL2NormalizationFloat32Workload.hpp | 5 +- .../ClWorkloads/ClLstmFloat32Workload.cpp | 405 +++ .../ClWorkloads/ClLstmFloat32Workload.hpp | 67 + .../ClWorkloads/ClMergerFloat32Workload.cpp | 2 +- .../ClWorkloads/ClMergerFloat32Workload.hpp | 4 +- .../ClWorkloads/ClMergerUint8Workload.cpp | 2 +- .../ClMultiplicationFloat32Workload.cpp | 26 +- .../ClMultiplicationFloat32Workload.hpp | 9 +- .../ClNormalizationFloat32Workload.cpp | 4 +- .../ClNormalizationFloat32Workload.hpp | 2 +- .../ClWorkloads/ClPermuteWorkload.cpp | 16 +- .../ClWorkloads/ClPermuteWorkload.hpp | 13 +- .../ClWorkloads/ClPooling2dBaseWorkload.cpp | 10 +- .../ClWorkloads/ClPooling2dBaseWorkload.hpp | 8 +- .../ClPooling2dFloat32Workload.cpp | 4 +- .../ClPooling2dFloat32Workload.hpp | 2 +- .../ClWorkloads/ClPooling2dUint8Workload.cpp | 2 +- .../ClWorkloads/ClReshapeFloat32Workload.cpp | 4 +- .../ClWorkloads/ClReshapeFloat32Workload.hpp | 2 +- .../ClWorkloads/ClReshapeUint8Workload.cpp | 2 +- .../ClResizeBilinearFloat32Workload.cpp | 4 +- .../ClResizeBilinearFloat32Workload.hpp | 2 +- .../ClWorkloads/ClSoftmaxBaseWorkload.cpp | 28 + .../ClWorkloads/ClSoftmaxBaseWorkload.hpp | 16 + .../ClWorkloads/ClSoftmaxFloat32Workload.cpp | 4 +- .../ClWorkloads/ClSoftmaxFloat32Workload.hpp | 2 +- .../ClWorkloads/ClSoftmaxUint8Workload.cpp | 2 +- .../ClWorkloads/ClSplitterFloat32Workload.cpp | 2 +- .../ClWorkloads/ClSplitterFloat32Workload.hpp | 4 +- .../ClWorkloads/ClSplitterUint8Workload.cpp | 2 +- src/armnn/backends/CpuTensorHandle.cpp | 6 + src/armnn/backends/CpuTensorHandle.hpp | 41 +- src/armnn/backends/ITensorHandle.hpp | 48 + src/armnn/backends/MakeWorkloadHelper.hpp | 19 +- src/armnn/backends/MemCopyWorkload.cpp | 223 +- src/armnn/backends/MemCopyWorkload.hpp | 120 +- src/armnn/backends/NeonLayerSupport.cpp | 242 +- src/armnn/backends/NeonLayerSupport.hpp | 39 +- src/armnn/backends/NeonTensorHandle.hpp | 73 +- src/armnn/backends/NeonWorkloadFactory.cpp | 110 +- src/armnn/backends/NeonWorkloadFactory.hpp | 29 +- src/armnn/backends/NeonWorkloadUtils.cpp | 21 +- src/armnn/backends/NeonWorkloadUtils.hpp | 9 + src/armnn/backends/NeonWorkloads.hpp | 3 + .../NeonActivationFloat32Workload.cpp | 27 +- .../NeonActivationFloat32Workload.hpp | 7 +- .../NeonActivationUint8Workload.cpp | 13 +- .../NeonAdditionFloat32Workload.cpp | 20 +- .../NeonAdditionFloat32Workload.hpp | 7 +- .../NeonBaseConstantWorkload.hpp | 25 +- .../NeonWorkloads/NeonBaseMergerWorkload.hpp | 11 +- .../NeonBaseSplitterWorkload.hpp | 11 +- .../NeonBatchNormalizationFloat32Workload.cpp | 75 +- .../NeonBatchNormalizationFloat32Workload.hpp | 20 +- .../NeonConstantFloat32Workload.cpp | 2 +- .../NeonConstantFloat32Workload.hpp | 4 +- .../NeonConstantUint8Workload.cpp | 2 +- .../NeonConvertFp16ToFp32Workload.cpp | 41 + .../NeonConvertFp16ToFp32Workload.hpp | 26 + .../NeonConvertFp32ToFp16Workload.cpp | 43 + .../NeonConvertFp32ToFp16Workload.hpp | 26 + .../NeonConvolution2dBaseWorkload.cpp | 69 +- .../NeonConvolution2dBaseWorkload.hpp | 13 +- .../NeonConvolution2dFloat32Workload.cpp | 7 +- .../NeonConvolution2dFloat32Workload.hpp | 2 +- .../NeonConvolution2dUint8Workload.cpp | 8 +- .../NeonDepthwiseConvolutionBaseWorkload.cpp | 46 + .../NeonDepthwiseConvolutionBaseWorkload.hpp | 19 + ...eonDepthwiseConvolutionFloat32Workload.cpp | 41 +- ...eonDepthwiseConvolutionFloat32Workload.hpp | 8 +- .../NeonDepthwiseConvolutionUint8Workload.cpp | 39 +- .../NeonDepthwiseConvolutionUint8Workload.hpp | 6 +- .../NeonFloorFloat32Workload.cpp | 4 +- .../NeonFloorFloat32Workload.hpp | 2 +- .../NeonFullyConnectedFloat32Workload.cpp | 67 +- .../NeonFullyConnectedFloat32Workload.hpp | 15 +- .../NeonL2NormalizationFloat32Workload.cpp | 16 +- .../NeonL2NormalizationFloat32Workload.hpp | 5 +- .../NeonWorkloads/NeonLstmFloat32Workload.cpp | 22 + .../NeonWorkloads/NeonLstmFloat32Workload.hpp | 20 + .../NeonMergerFloat32Workload.cpp | 2 +- .../NeonMergerFloat32Workload.hpp | 4 +- .../NeonWorkloads/NeonMergerUint8Workload.cpp | 2 +- .../NeonMultiplicationFloat32Workload.cpp | 23 +- .../NeonMultiplicationFloat32Workload.hpp | 5 +- .../NeonNormalizationFloat32Workload.cpp | 23 +- .../NeonNormalizationFloat32Workload.hpp | 6 +- .../NeonWorkloads/NeonPermuteWorkload.cpp | 16 +- .../NeonWorkloads/NeonPermuteWorkload.hpp | 13 +- .../NeonPooling2dBaseWorkload.cpp | 8 +- .../NeonPooling2dBaseWorkload.hpp | 8 +- .../NeonPooling2dFloat32Workload.cpp | 5 +- .../NeonPooling2dFloat32Workload.hpp | 3 +- .../NeonPooling2dUint8Workload.cpp | 2 +- .../NeonReshapeFloat32Workload.cpp | 4 +- .../NeonReshapeFloat32Workload.hpp | 2 +- .../NeonReshapeUint8Workload.cpp | 2 +- .../NeonWorkloads/NeonSoftmaxBaseWorkload.cpp | 30 + .../NeonWorkloads/NeonSoftmaxBaseWorkload.hpp | 17 + .../NeonSoftmaxFloat32Workload.cpp | 6 +- .../NeonSoftmaxFloat32Workload.hpp | 2 +- .../NeonSoftmaxUint8Workload.cpp | 2 +- .../NeonSplitterFloat32Workload.cpp | 2 +- .../NeonSplitterFloat32Workload.hpp | 4 +- .../NeonSplitterUint8Workload.cpp | 2 +- src/armnn/backends/OutputHandler.cpp | 8 - src/armnn/backends/OutputHandler.hpp | 21 +- src/armnn/backends/RefLayerSupport.cpp | 99 +- src/armnn/backends/RefLayerSupport.hpp | 38 + src/armnn/backends/RefWorkloadFactory.cpp | 61 +- src/armnn/backends/RefWorkloadFactory.hpp | 22 +- src/armnn/backends/RefWorkloads.hpp | 3 + .../backends/RefWorkloads/Activation.cpp | 2 +- .../backends/RefWorkloads/Activation.hpp | 2 +- src/armnn/backends/RefWorkloads/Broadcast.hpp | 2 +- src/armnn/backends/RefWorkloads/ConvImpl.cpp | 2 +- src/armnn/backends/RefWorkloads/ConvImpl.hpp | 26 +- .../backends/RefWorkloads/FullyConnected.cpp | 6 +- .../backends/RefWorkloads/FullyConnected.hpp | 2 +- src/armnn/backends/RefWorkloads/Merger.hpp | 14 +- src/armnn/backends/RefWorkloads/Pooling2d.cpp | 8 +- src/armnn/backends/RefWorkloads/Pooling2d.hpp | 2 +- .../RefWorkloads/RefBaseConstantWorkload.hpp | 2 +- .../RefBatchNormalizationFloat32Workload.cpp | 15 +- .../RefBatchNormalizationFloat32Workload.hpp | 9 +- .../RefBatchNormalizationUint8Workload.cpp | 23 +- .../RefBatchNormalizationUint8Workload.hpp | 9 +- .../RefConvertFp16ToFp32Workload.cpp | 25 + .../RefConvertFp16ToFp32Workload.hpp | 21 + .../RefConvertFp32ToFp16Workload.cpp | 29 + .../RefConvertFp32ToFp16Workload.hpp | 21 + .../RefConvolution2dFloat32Workload.cpp | 13 +- .../RefConvolution2dFloat32Workload.hpp | 8 +- .../RefConvolution2dUint8Workload.cpp | 15 +- .../RefConvolution2dUint8Workload.hpp | 9 +- ...fDepthwiseConvolution2dFloat32Workload.cpp | 13 +- ...fDepthwiseConvolution2dFloat32Workload.hpp | 8 +- ...RefDepthwiseConvolution2dUint8Workload.cpp | 16 +- ...RefDepthwiseConvolution2dUint8Workload.hpp | 7 +- .../RefFullyConnectedFloat32Workload.cpp | 10 +- .../RefFullyConnectedFloat32Workload.hpp | 7 +- .../RefFullyConnectedUint8Workload.cpp | 16 +- .../RefFullyConnectedUint8Workload.hpp | 7 +- .../RefWorkloads/RefLstmFloat32Workload.cpp | 16 + .../RefWorkloads/RefLstmFloat32Workload.hpp | 21 + .../RefNormalizationFloat32Workload.cpp | 4 +- .../RefWorkloads/RefPermuteWorkload.cpp | 1 + .../RefWorkloads/RefWorkloadUtils.hpp | 13 + .../backends/RefWorkloads/ResizeBilinear.cpp | 22 +- src/armnn/backends/RefWorkloads/Softmax.cpp | 8 +- src/armnn/backends/RefWorkloads/Softmax.hpp | 2 +- src/armnn/backends/RefWorkloads/Splitter.hpp | 8 +- .../RefWorkloads/TensorBufferArrayView.hpp | 2 +- src/armnn/backends/Workload.hpp | 81 +- src/armnn/backends/WorkloadData.cpp | 69 +- src/armnn/backends/WorkloadData.hpp | 96 +- src/armnn/backends/WorkloadFactory.cpp | 418 ++- src/armnn/backends/WorkloadFactory.hpp | 23 +- src/armnn/backends/WorkloadUtils.hpp | 139 + src/armnn/backends/test/ActivationFixture.hpp | 2 +- .../backends/test/ActivationTestImpl.hpp | 27 +- src/armnn/backends/test/ArmComputeCl.cpp | 48 +- src/armnn/backends/test/ArmComputeNeon.cpp | 156 +- src/armnn/backends/test/BatchNormTestImpl.hpp | 6 +- .../backends/test/ClContextControlFixture.hpp | 21 + src/armnn/backends/test/Conv2dTestImpl.hpp | 52 +- .../test/ConvertFp16ToFp32TestImpl.hpp | 55 + .../test/ConvertFp32ToFp16TestImpl.hpp | 55 + src/armnn/backends/test/CreateWorkloadCl.cpp | 340 +- .../backends/test/CreateWorkloadNeon.cpp | 270 +- src/armnn/backends/test/CreateWorkloadRef.cpp | 219 +- .../backends/test/FullyConnectedTestImpl.hpp | 8 +- .../backends/test/IsLayerSupportedTest.cpp | 178 +- .../test/IsLayerSupportedTestImpl.hpp | 167 +- .../test/LayerReleaseConstantDataTest.cpp | 212 ++ src/armnn/backends/test/LayerTests.cpp | 166 +- src/armnn/backends/test/LayerTests.hpp | 25 +- src/armnn/backends/test/LstmTestImpl.hpp | 1150 ++++++ src/armnn/backends/test/MemCopyTests.cpp | 24 + src/armnn/backends/test/NormTestImpl.hpp | 4 +- src/armnn/backends/test/Pooling2dTestImpl.hpp | 14 +- src/armnn/backends/test/QuantizeHelper.hpp | 2 +- src/armnn/backends/test/Reference.cpp | 26 +- src/armnn/backends/test/SoftmaxTestImpl.hpp | 2 +- src/armnn/backends/test/SplitterTestImpl.hpp | 40 +- src/armnn/backends/test/TensorCopyUtils.cpp | 11 +- .../backends/test/WorkloadDataValidation.cpp | 71 +- src/armnn/layers/ActivationLayer.cpp | 8 +- src/armnn/layers/AdditionLayer.cpp | 40 +- src/armnn/layers/AdditionLayer.hpp | 2 + src/armnn/layers/BatchNormalizationLayer.cpp | 24 +- src/armnn/layers/BatchNormalizationLayer.hpp | 2 + src/armnn/layers/ConstantLayer.cpp | 18 +- src/armnn/layers/ConstantLayer.hpp | 12 +- src/armnn/layers/ConvertFp16ToFp32Layer.cpp | 48 + src/armnn/layers/ConvertFp16ToFp32Layer.hpp | 28 + src/armnn/layers/ConvertFp32ToFp16Layer.cpp | 47 + src/armnn/layers/ConvertFp32ToFp16Layer.hpp | 27 + src/armnn/layers/Convolution2dLayer.cpp | 43 +- src/armnn/layers/Convolution2dLayer.hpp | 4 + .../layers/DepthwiseConvolution2dLayer.cpp | 46 +- .../layers/DepthwiseConvolution2dLayer.hpp | 4 + src/armnn/layers/FakeQuantizationLayer.cpp | 12 +- src/armnn/layers/FloorLayer.cpp | 16 +- src/armnn/layers/FullyConnectedLayer.cpp | 40 +- src/armnn/layers/FullyConnectedLayer.hpp | 3 + src/armnn/layers/L2NormalizationLayer.cpp | 13 +- src/armnn/layers/LayerWithParameters.hpp | 6 +- src/armnn/layers/LstmLayer.cpp | 259 ++ src/armnn/layers/LstmLayer.hpp | 70 + src/armnn/layers/MemCopyLayer.cpp | 15 +- src/armnn/layers/MergerLayer.cpp | 73 +- src/armnn/layers/MergerLayer.hpp | 1 + src/armnn/layers/MultiplicationLayer.cpp | 40 +- src/armnn/layers/MultiplicationLayer.hpp | 1 + src/armnn/layers/NormalizationLayer.cpp | 10 +- src/armnn/layers/OutputLayer.cpp | 2 +- src/armnn/layers/PermuteLayer.cpp | 20 +- src/armnn/layers/PermuteLayer.hpp | 1 + src/armnn/layers/Pooling2dLayer.cpp | 30 +- src/armnn/layers/Pooling2dLayer.hpp | 9 +- src/armnn/layers/ReshapeLayer.cpp | 16 +- src/armnn/layers/ReshapeLayer.hpp | 1 + src/armnn/layers/ResizeBilinearLayer.cpp | 24 +- src/armnn/layers/ResizeBilinearLayer.hpp | 1 + src/armnn/layers/SoftmaxLayer.cpp | 10 +- src/armnn/layers/SoftmaxLayer.hpp | 8 +- src/armnn/layers/SplitterLayer.cpp | 32 +- src/armnn/layers/SplitterLayer.hpp | 1 + src/armnn/memory/BaseMemoryManager.cpp | 125 + src/armnn/memory/BaseMemoryManager.hpp | 104 + src/armnn/memory/BlobLifetimeManager.cpp | 79 + src/armnn/memory/BlobLifetimeManager.hpp | 35 + src/armnn/memory/BlobMemoryPool.cpp | 88 + src/armnn/memory/BlobMemoryPool.hpp | 55 + src/armnn/memory/IMemoryPool.hpp | 22 + src/armnn/memory/IPoolManager.hpp | 21 + src/armnn/memory/OffsetLifetimeManager.cpp | 62 + src/armnn/memory/OffsetLifetimeManager.hpp | 37 + src/armnn/memory/OffsetMemoryPool.cpp | 84 + src/armnn/memory/OffsetMemoryPool.hpp | 54 + src/armnn/memory/PoolManager.cpp | 105 + src/armnn/memory/PoolManager.hpp | 56 + src/armnn/optimizations/All.hpp | 3 + src/armnn/optimizations/ConvertConstants.hpp | 98 + .../ConvertFp32NetworkToFp16.hpp | 80 + src/armnn/optimizations/MovePermuteUp.hpp | 10 +- src/armnn/optimizations/Optimization.hpp | 7 +- .../OptimizeConsecutiveReshapes.hpp | 10 +- .../OptimizeInverseConversions.hpp | 44 + src/armnn/optimizations/PermuteAsReshape.hpp | 2 +- .../optimizations/SquashEqualSiblings.hpp | 2 +- src/armnn/test/CreateWorkload.hpp | 487 ++- src/armnn/test/CreateWorkloadClNeon.hpp | 15 +- src/armnn/test/CsvReaderTest.cpp | 124 + src/armnn/test/EndToEndTest.cpp | 158 +- src/armnn/test/FP16SupportTest.cpp | 114 + src/armnn/test/FloatingPointConverterTest.cpp | 58 + src/armnn/test/GraphTests.cpp | 119 +- src/armnn/test/InstrumentTests.cpp | 62 + src/armnn/test/JsonPrinterTests.cpp | 378 ++ src/armnn/test/NeonTimerTest.cpp | 104 + src/armnn/test/NetworkTests.cpp | 968 ++++++ src/armnn/test/Network_test.cpp | 483 --- src/armnn/test/ObservableTest.cpp | 94 + src/armnn/test/OpenClTimerTest.cpp | 137 + src/armnn/test/OptimizerTests.cpp | 498 ++- src/armnn/test/ProfilerTests.cpp | 235 ++ src/armnn/test/ProfilingEventTest.cpp | 95 + src/armnn/test/RuntimeTests.cpp | 251 +- src/armnn/test/TensorHelpers.hpp | 12 +- src/armnn/test/TensorTest.cpp | 8 +- src/armnn/test/UnitTests.cpp | 2 +- src/armnn/test/UnitTests.hpp | 20 +- src/armnn/test/UtilsTests.cpp | 110 + src/armnnCaffeParser/CaffeParser.cpp | 1311 ++++--- src/armnnCaffeParser/CaffeParser.hpp | 141 +- src/armnnCaffeParser/CaffeSupport.md | 5 + .../RecordByRecordCaffeParser.cpp | 732 ++++ .../RecordByRecordCaffeParser.hpp | 53 + src/armnnCaffeParser/test/TestAdd.cpp | 2 +- src/armnnCaffeParser/test/TestConcat.cpp | 2 +- src/armnnCaffeParser/test/TestConvolution.cpp | 133 + src/armnnCaffeParser/test/TestDropout.cpp | 2 +- src/armnnCaffeParser/test/TestInPlace.cpp | 4 +- src/armnnCaffeParser/test/TestInputs.cpp | 22 +- src/armnnCaffeParser/test/TestMul.cpp | 2 +- .../test/TestMultiInputsOutputs.cpp | 2 +- src/armnnCaffeParser/test/TestPooling2d.cpp | 2 +- src/armnnCaffeParser/test/TestSplit.cpp | 2 +- src/armnnOnnxParser/OnnxParser.cpp | 1676 +++++++++ src/armnnOnnxParser/OnnxParser.hpp | 183 + src/armnnOnnxParser/OnnxSupport.md | 60 + src/armnnOnnxParser/README.md | 5 + src/armnnOnnxParser/test/Addition.cpp | 311 ++ src/armnnOnnxParser/test/BatchNorm.cpp | 342 ++ src/armnnOnnxParser/test/Const.cpp | 87 + src/armnnOnnxParser/test/Constructor.cpp | 16 + src/armnnOnnxParser/test/Conv2D.cpp | 469 +++ src/armnnOnnxParser/test/CreateNetwork.cpp | 63 + src/armnnOnnxParser/test/DepthConv.cpp | 162 + src/armnnOnnxParser/test/FullyConnected.cpp | 597 ++++ src/armnnOnnxParser/test/GetInputsOutputs.cpp | 255 ++ src/armnnOnnxParser/test/Pooling.cpp | 310 ++ src/armnnOnnxParser/test/ProtoxtFixture.cpp | 81 + src/armnnOnnxParser/test/Relu.cpp | 70 + src/armnnOnnxParser/test/Reshape.cpp | 110 + src/armnnTfLiteParser/README.md | 7 + .../TensorFlowLiteSupport.md | 27 + src/armnnTfLiteParser/TfLiteParser.cpp | 1440 ++++++++ src/armnnTfLiteParser/TfLiteParser.hpp | 156 + src/armnnTfLiteParser/test/AvgPool2D.cpp | 119 + src/armnnTfLiteParser/test/Conv2D.cpp | 351 ++ .../test/DepthwiseConvolution2D.cpp | 199 ++ src/armnnTfLiteParser/test/GetBuffer.cpp | 126 + .../test/GetInputsOutputs.cpp | 239 ++ .../test/GetSubgraphInputsOutputs.cpp | 230 ++ src/armnnTfLiteParser/test/GetTensorIds.cpp | 162 + .../test/InputOutputTensorNames.cpp | 138 + src/armnnTfLiteParser/test/LoadModel.cpp | 241 ++ .../test/OutputShapeOfSqueeze.cpp | 61 + .../test/ParserFlatbuffersFixture.hpp | 229 ++ src/armnnTfLiteParser/test/Softmax.cpp | 78 + src/armnnTfLiteParser/test/Squeeze.cpp | 144 + src/armnnTfParser/README.md | 2 +- src/armnnTfParser/TensorFlowSupport.md | 9 + src/armnnTfParser/TfParser.cpp | 927 +++-- src/armnnTfParser/TfParser.hpp | 48 +- src/armnnTfParser/test/Activations.cpp | 6 +- src/armnnTfParser/test/Addition.cpp | 2 +- src/armnnTfParser/test/BiasAdd.cpp | 2 +- src/armnnTfParser/test/BroadcastForAdd.cpp | 6 +- src/armnnTfParser/test/Concat.cpp | 2 +- src/armnnTfParser/test/ConcatOfConcats.cpp | 2 +- src/armnnTfParser/test/Constant.cpp | 20 +- src/armnnTfParser/test/Convolution2d.cpp | 15 +- .../test/DepthwiseConvolution2d.cpp | 2 +- src/armnnTfParser/test/FullyConnected.cpp | 38 +- src/armnnTfParser/test/FusedBatchNorm.cpp | 6 +- src/armnnTfParser/test/Identity.cpp | 6 +- .../test/LocalResponseNormalization.cpp | 3 +- .../test/MaximumForLeakyRelu.cpp | 169 + src/armnnTfParser/test/MultiOutput.cpp | 6 +- src/armnnTfParser/test/Multiplication.cpp | 4 +- src/armnnTfParser/test/PassThru.cpp | 4 +- src/armnnTfParser/test/Pooling.cpp | 3 +- src/armnnTfParser/test/Reshape.cpp | 3 +- src/armnnTfParser/test/ResizeBilinear.cpp | 6 +- src/armnnTfParser/test/Shape.cpp | 7 +- src/armnnTfParser/test/Softmax.cpp | 2 +- src/armnnTfParser/test/Squeeze.cpp | 3 +- src/armnnTfParser/test/TestDependencies.cpp | 26 +- .../test/TestMultiInputsOutputs.cpp | 10 +- src/armnnUtils/CsvReader.cpp | 63 + src/armnnUtils/CsvReader.hpp | 25 + src/armnnUtils/FloatingPointConverter.cpp | 44 + src/armnnUtils/FloatingPointConverter.hpp | 21 + src/armnnUtils/GraphTopologicalSort.hpp | 86 +- src/armnnUtils/HeapProfiling.hpp | 10 +- src/armnnUtils/LeakChecking.cpp | 19 + src/armnnUtils/LeakChecking.hpp | 21 +- src/armnnUtils/Logging.cpp | 2 +- src/armnnUtils/ParserFlatbuffersFixture.hpp | 11 - src/armnnUtils/ParserPrototxtFixture.hpp | 76 +- src/armnnUtils/Permute.cpp | 2 +- src/armnnUtils/VerificationHelpers.cpp | 74 + src/armnnUtils/VerificationHelpers.hpp | 35 + tests/CMakeLists.txt | 97 +- .../CaffeAlexNet-Armnn/CaffeAlexNet-Armnn.cpp | 13 +- .../CaffeCifar10AcrossChannels-Armnn.cpp | 11 +- .../CaffeInception_BN-Armnn.cpp | 13 +- tests/CaffeMnist-Armnn/CaffeMnist-Armnn.cpp | 11 +- ...eNetDatabase.cpp => CaffePreprocessor.cpp} | 12 +- ...eNetDatabase.hpp => CaffePreprocessor.hpp} | 13 +- tests/CaffeResNet-Armnn/CaffeResNet-Armnn.cpp | 14 +- .../CaffeSqueezeNet1_0-Armnn.cpp | 6 +- tests/CaffeVGG-Armnn/CaffeVGG-Armnn.cpp | 14 +- tests/CaffeYolo-Armnn/CaffeYolo-Armnn.cpp | 1 + tests/Cifar10Database.hpp | 3 +- tests/ExecuteNetwork/ExecuteNetwork.cpp | 518 ++- tests/ImagePreprocessor.cpp | 74 + tests/ImagePreprocessor.hpp | 73 + tests/InferenceModel.hpp | 270 +- tests/InferenceTest.cpp | 23 +- tests/InferenceTest.hpp | 44 +- tests/InferenceTest.inl | 54 +- tests/InferenceTestImage.cpp | 158 +- tests/InferenceTestImage.hpp | 25 +- tests/MnistDatabase.cpp | 8 +- tests/MnistDatabase.hpp | 3 +- tests/MobileNetDatabase.cpp | 133 - tests/MobileNetDatabase.hpp | 36 - .../MultipleNetworksCifar10.cpp | 30 +- tests/OnnxMnist-Armnn/OnnxMnist-Armnn.cpp | 39 + tests/OnnxMnist-Armnn/Validation.txt | 1000 ++++++ .../OnnxMobileNet-Armnn.cpp | 60 + tests/OnnxMobileNet-Armnn/Validation.txt | 201 ++ tests/OnnxMobileNet-Armnn/labels.txt | 1001 ++++++ tests/TfCifar10-Armnn/TfCifar10-Armnn.cpp | 12 +- .../TfInceptionV3-Armnn.cpp | 13 +- .../TfLiteMobilenetQuantized-Armnn.cpp | 84 + .../Validation.txt | 201 ++ .../TfLiteMobilenetQuantized-Armnn/labels.txt | 1001 ++++++ tests/TfMnist-Armnn/TfMnist-Armnn.cpp | 11 +- tests/TfMobileNet-Armnn/TfMobileNet-Armnn.cpp | 26 +- .../TfResNext_Quantized-Armnn.cpp | 13 +- tests/YoloDatabase.cpp | 8 +- tests/YoloInferenceTest.hpp | 12 +- third-party/half/ChangeLog.txt | 184 + third-party/half/LICENSE.txt | 21 + third-party/half/README.txt | 288 ++ third-party/half/half.hpp | 3068 +++++++++++++++++ 534 files changed, 37520 insertions(+), 5185 deletions(-) create mode 100644 BuildGuideCrossCompilation.md create mode 100644 include/armnn/IProfiler.hpp create mode 100644 include/armnn/LstmParams.hpp create mode 100644 include/armnnOnnxParser/IOnnxParser.hpp create mode 100644 include/armnnTfLiteParser/ITfLiteParser.hpp create mode 100644 samples/CMakeLists.txt create mode 100644 samples/SimpleSample.cpp create mode 100644 src/armnn/DeviceSpec.hpp create mode 100644 src/armnn/Half.hpp create mode 100644 src/armnn/IGraphObservable.hpp create mode 100644 src/armnn/Instrument.hpp create mode 100644 src/armnn/JsonPrinter.cpp create mode 100644 src/armnn/JsonPrinter.hpp create mode 100644 src/armnn/NeonInterceptorScheduler.cpp create mode 100644 src/armnn/NeonInterceptorScheduler.hpp create mode 100644 src/armnn/NeonTimer.cpp create mode 100644 src/armnn/NeonTimer.hpp create mode 100644 src/armnn/NetworkUtils.hpp create mode 100644 src/armnn/Observable.cpp create mode 100644 src/armnn/Observable.hpp create mode 100644 src/armnn/OpenClTimer.cpp create mode 100644 src/armnn/OpenClTimer.hpp create mode 100644 src/armnn/ProfilingEvent.cpp create mode 100644 src/armnn/ProfilingEvent.hpp create mode 100644 src/armnn/TypeUtils.hpp create mode 100644 src/armnn/WallClockTimer.cpp create mode 100644 src/armnn/WallClockTimer.hpp delete mode 100644 src/armnn/backends/AclBaseMemoryManager.cpp delete mode 100644 src/armnn/backends/AclBaseMemoryManager.hpp create mode 100644 src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.cpp create mode 100644 src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.hpp create mode 100644 src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.cpp create mode 100644 src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.hpp create mode 100644 src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.cpp create mode 100644 src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.hpp create mode 100644 src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.cpp create mode 100644 src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.hpp create mode 100644 src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.cpp create mode 100644 src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp delete mode 100644 src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionHelper.hpp create mode 100644 src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.cpp create mode 100644 src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.hpp create mode 100644 src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.cpp create mode 100644 src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.hpp create mode 100644 src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.cpp create mode 100644 src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.hpp create mode 100644 src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.cpp create mode 100644 src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.hpp create mode 100644 src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.cpp create mode 100644 src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.hpp create mode 100644 src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.cpp create mode 100644 src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.hpp create mode 100644 src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.cpp create mode 100644 src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.hpp create mode 100644 src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.cpp create mode 100644 src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.hpp create mode 100644 src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.cpp create mode 100644 src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.hpp create mode 100644 src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.cpp create mode 100644 src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.hpp create mode 100644 src/armnn/backends/WorkloadUtils.hpp create mode 100644 src/armnn/backends/test/ClContextControlFixture.hpp create mode 100644 src/armnn/backends/test/ConvertFp16ToFp32TestImpl.hpp create mode 100644 src/armnn/backends/test/ConvertFp32ToFp16TestImpl.hpp create mode 100644 src/armnn/backends/test/LayerReleaseConstantDataTest.cpp create mode 100644 src/armnn/backends/test/LstmTestImpl.hpp create mode 100644 src/armnn/layers/ConvertFp16ToFp32Layer.cpp create mode 100644 src/armnn/layers/ConvertFp16ToFp32Layer.hpp create mode 100644 src/armnn/layers/ConvertFp32ToFp16Layer.cpp create mode 100644 src/armnn/layers/ConvertFp32ToFp16Layer.hpp create mode 100644 src/armnn/layers/LstmLayer.cpp create mode 100644 src/armnn/layers/LstmLayer.hpp create mode 100644 src/armnn/memory/BaseMemoryManager.cpp create mode 100644 src/armnn/memory/BaseMemoryManager.hpp create mode 100644 src/armnn/memory/BlobLifetimeManager.cpp create mode 100644 src/armnn/memory/BlobLifetimeManager.hpp create mode 100644 src/armnn/memory/BlobMemoryPool.cpp create mode 100644 src/armnn/memory/BlobMemoryPool.hpp create mode 100644 src/armnn/memory/IMemoryPool.hpp create mode 100644 src/armnn/memory/IPoolManager.hpp create mode 100644 src/armnn/memory/OffsetLifetimeManager.cpp create mode 100644 src/armnn/memory/OffsetLifetimeManager.hpp create mode 100644 src/armnn/memory/OffsetMemoryPool.cpp create mode 100644 src/armnn/memory/OffsetMemoryPool.hpp create mode 100644 src/armnn/memory/PoolManager.cpp create mode 100644 src/armnn/memory/PoolManager.hpp create mode 100644 src/armnn/optimizations/ConvertConstants.hpp create mode 100644 src/armnn/optimizations/ConvertFp32NetworkToFp16.hpp create mode 100644 src/armnn/optimizations/OptimizeInverseConversions.hpp create mode 100644 src/armnn/test/CsvReaderTest.cpp create mode 100644 src/armnn/test/FP16SupportTest.cpp create mode 100644 src/armnn/test/FloatingPointConverterTest.cpp create mode 100644 src/armnn/test/InstrumentTests.cpp create mode 100644 src/armnn/test/JsonPrinterTests.cpp create mode 100644 src/armnn/test/NeonTimerTest.cpp create mode 100644 src/armnn/test/NetworkTests.cpp delete mode 100644 src/armnn/test/Network_test.cpp create mode 100644 src/armnn/test/ObservableTest.cpp create mode 100644 src/armnn/test/OpenClTimerTest.cpp create mode 100644 src/armnn/test/ProfilerTests.cpp create mode 100644 src/armnn/test/ProfilingEventTest.cpp create mode 100644 src/armnnCaffeParser/RecordByRecordCaffeParser.cpp create mode 100644 src/armnnCaffeParser/RecordByRecordCaffeParser.hpp create mode 100644 src/armnnCaffeParser/test/TestConvolution.cpp create mode 100644 src/armnnOnnxParser/OnnxParser.cpp create mode 100644 src/armnnOnnxParser/OnnxParser.hpp create mode 100644 src/armnnOnnxParser/OnnxSupport.md create mode 100644 src/armnnOnnxParser/README.md create mode 100644 src/armnnOnnxParser/test/Addition.cpp create mode 100644 src/armnnOnnxParser/test/BatchNorm.cpp create mode 100644 src/armnnOnnxParser/test/Const.cpp create mode 100644 src/armnnOnnxParser/test/Constructor.cpp create mode 100644 src/armnnOnnxParser/test/Conv2D.cpp create mode 100644 src/armnnOnnxParser/test/CreateNetwork.cpp create mode 100644 src/armnnOnnxParser/test/DepthConv.cpp create mode 100644 src/armnnOnnxParser/test/FullyConnected.cpp create mode 100644 src/armnnOnnxParser/test/GetInputsOutputs.cpp create mode 100644 src/armnnOnnxParser/test/Pooling.cpp create mode 100644 src/armnnOnnxParser/test/ProtoxtFixture.cpp create mode 100644 src/armnnOnnxParser/test/Relu.cpp create mode 100644 src/armnnOnnxParser/test/Reshape.cpp create mode 100644 src/armnnTfLiteParser/README.md create mode 100644 src/armnnTfLiteParser/TensorFlowLiteSupport.md create mode 100644 src/armnnTfLiteParser/TfLiteParser.cpp create mode 100644 src/armnnTfLiteParser/TfLiteParser.hpp create mode 100644 src/armnnTfLiteParser/test/AvgPool2D.cpp create mode 100644 src/armnnTfLiteParser/test/Conv2D.cpp create mode 100644 src/armnnTfLiteParser/test/DepthwiseConvolution2D.cpp create mode 100644 src/armnnTfLiteParser/test/GetBuffer.cpp create mode 100644 src/armnnTfLiteParser/test/GetInputsOutputs.cpp create mode 100644 src/armnnTfLiteParser/test/GetSubgraphInputsOutputs.cpp create mode 100644 src/armnnTfLiteParser/test/GetTensorIds.cpp create mode 100644 src/armnnTfLiteParser/test/InputOutputTensorNames.cpp create mode 100644 src/armnnTfLiteParser/test/LoadModel.cpp create mode 100644 src/armnnTfLiteParser/test/OutputShapeOfSqueeze.cpp create mode 100644 src/armnnTfLiteParser/test/ParserFlatbuffersFixture.hpp create mode 100644 src/armnnTfLiteParser/test/Softmax.cpp create mode 100644 src/armnnTfLiteParser/test/Squeeze.cpp create mode 100644 src/armnnTfParser/test/MaximumForLeakyRelu.cpp create mode 100644 src/armnnUtils/CsvReader.cpp create mode 100644 src/armnnUtils/CsvReader.hpp create mode 100644 src/armnnUtils/FloatingPointConverter.cpp create mode 100644 src/armnnUtils/FloatingPointConverter.hpp delete mode 100644 src/armnnUtils/ParserFlatbuffersFixture.hpp create mode 100644 src/armnnUtils/VerificationHelpers.cpp create mode 100644 src/armnnUtils/VerificationHelpers.hpp rename tests/{ImageNetDatabase.cpp => CaffePreprocessor.cpp} (74%) rename tests/{ImageNetDatabase.hpp => CaffePreprocessor.hpp} (73%) create mode 100644 tests/ImagePreprocessor.cpp create mode 100644 tests/ImagePreprocessor.hpp delete mode 100644 tests/MobileNetDatabase.cpp delete mode 100644 tests/MobileNetDatabase.hpp create mode 100644 tests/OnnxMnist-Armnn/OnnxMnist-Armnn.cpp create mode 100644 tests/OnnxMnist-Armnn/Validation.txt create mode 100644 tests/OnnxMobileNet-Armnn/OnnxMobileNet-Armnn.cpp create mode 100644 tests/OnnxMobileNet-Armnn/Validation.txt create mode 100644 tests/OnnxMobileNet-Armnn/labels.txt create mode 100644 tests/TfLiteMobilenetQuantized-Armnn/TfLiteMobilenetQuantized-Armnn.cpp create mode 100644 tests/TfLiteMobilenetQuantized-Armnn/Validation.txt create mode 100644 tests/TfLiteMobilenetQuantized-Armnn/labels.txt create mode 100644 third-party/half/ChangeLog.txt create mode 100644 third-party/half/LICENSE.txt create mode 100644 third-party/half/README.txt create mode 100644 third-party/half/half.hpp diff --git a/Android.mk b/Android.mk index f008840e30..e83000414f 100644 --- a/Android.mk +++ b/Android.mk @@ -31,32 +31,39 @@ LOCAL_EXPORT_C_INCLUDES := \ $(ARMNN_SOURCE_UTILS_HEADER_PATH) LOCAL_C_INCLUDES := \ - $(OPENCL_HEADER_PATH) \ - $(NN_HEADER_PATH) \ - $(ARMNN_HEADER_PATH) \ - $(ARMNN_SOURCE_HEADER_PATH) \ - $(ARMNN_SOURCE_UTILS_HEADER_PATH) + $(OPENCL_HEADER_PATH) \ + $(NN_HEADER_PATH) \ + $(ARMNN_HEADER_PATH) \ + $(ARMNN_SOURCE_HEADER_PATH) \ + $(ARMNN_SOURCE_UTILS_HEADER_PATH) LOCAL_SRC_FILES := \ + src/armnnUtils/DotSerializer.cpp \ + src/armnnUtils/FloatingPointConverter.cpp \ src/armnnUtils/Logging.cpp \ src/armnnUtils/Permute.cpp \ - src/armnnUtils/DotSerializer.cpp \ src/armnn/backends/ArmComputeTensorUtils.cpp \ src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.cpp \ src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp \ + src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.cpp \ src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp \ + src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.cpp \ src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp \ src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.cpp \ src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.cpp \ src/armnn/backends/ClWorkloads/ClConstantUint8Workload.cpp \ + src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.cpp \ + src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.cpp \ src/armnn/backends/ClWorkloads/ClConvolution2dBaseWorkload.cpp \ src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.cpp \ src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp \ + src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.cpp \ src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp \ src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp \ src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp \ src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp \ src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp \ + src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.cpp \ src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp \ src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp \ src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.cpp \ @@ -68,6 +75,7 @@ LOCAL_SRC_FILES := \ src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.cpp \ src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.cpp \ src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp \ + src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.cpp \ src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp \ src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp \ src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.cpp \ @@ -78,14 +86,18 @@ LOCAL_SRC_FILES := \ src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.cpp \ src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.cpp \ src/armnn/backends/NeonWorkloads/NeonConstantUint8Workload.cpp \ + src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.cpp \ + src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.cpp \ src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.cpp \ src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.cpp \ src/armnn/backends/NeonWorkloads/NeonConvolution2dUint8Workload.cpp \ + src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.cpp \ src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.cpp \ src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.cpp \ src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.cpp \ src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.cpp \ src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.cpp \ + src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.cpp \ src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.cpp \ src/armnn/backends/NeonWorkloads/NeonMergerUint8Workload.cpp \ src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.cpp \ @@ -96,6 +108,7 @@ LOCAL_SRC_FILES := \ src/armnn/backends/NeonWorkloads/NeonPooling2dUint8Workload.cpp \ src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.cpp \ src/armnn/backends/NeonWorkloads/NeonReshapeUint8Workload.cpp \ + src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.cpp \ src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.cpp \ src/armnn/backends/NeonWorkloads/NeonSoftmaxUint8Workload.cpp \ src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.cpp \ @@ -129,6 +142,7 @@ LOCAL_SRC_FILES := \ src/armnn/backends/RefWorkloads/Activation.cpp \ src/armnn/backends/RefWorkloads/RefReshapeUint8Workload.cpp \ src/armnn/backends/RefWorkloads/RefL2NormalizationFloat32Workload.cpp \ + src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.cpp \ src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.cpp \ src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.cpp \ src/armnn/backends/RefWorkloads/RefSplitterFloat32Workload.cpp \ @@ -147,21 +161,25 @@ LOCAL_SRC_FILES := \ src/armnn/backends/RefWorkloads/RefMergerFloat32Workload.cpp \ src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.cpp \ src/armnn/backends/RefWorkloads/RefPermuteWorkload.cpp \ + src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.cpp \ + src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.cpp \ src/armnn/backends/MemCopyWorkload.cpp \ src/armnn/backends/WorkloadData.cpp \ src/armnn/backends/WorkloadFactory.cpp \ - src/armnn/backends/AclBaseMemoryManager.cpp \ src/armnn/layers/ActivationLayer.cpp \ src/armnn/layers/AdditionLayer.cpp \ src/armnn/layers/BatchNormalizationLayer.cpp \ src/armnn/layers/ConstantLayer.cpp \ src/armnn/layers/Convolution2dLayer.cpp \ + src/armnn/layers/ConvertFp16ToFp32Layer.cpp \ + src/armnn/layers/ConvertFp32ToFp16Layer.cpp \ src/armnn/layers/DepthwiseConvolution2dLayer.cpp \ src/armnn/layers/FakeQuantizationLayer.cpp \ src/armnn/layers/FloorLayer.cpp \ src/armnn/layers/FullyConnectedLayer.cpp \ src/armnn/layers/InputLayer.cpp \ src/armnn/layers/L2NormalizationLayer.cpp \ + src/armnn/layers/LstmLayer.cpp \ src/armnn/layers/MemCopyLayer.cpp \ src/armnn/layers/MergerLayer.cpp \ src/armnn/layers/MultiplicationLayer.cpp \ @@ -182,20 +200,33 @@ LOCAL_SRC_FILES := \ src/armnn/InternalTypes.cpp \ src/armnn/Layer.cpp \ src/armnn/LoadedNetwork.cpp \ + src/armnn/NeonInterceptorScheduler.cpp \ + src/armnn/NeonTimer.cpp \ src/armnn/Network.cpp \ src/armnn/backends/OutputHandler.cpp \ + src/armnn/OpenClTimer.cpp \ + src/armnn/WallClockTimer.cpp \ + src/armnn/ProfilingEvent.cpp \ src/armnn/Profiling.cpp \ + src/armnn/JsonPrinter.cpp \ src/armnn/Tensor.cpp \ src/armnn/Utils.cpp \ src/armnn/LayerSupport.cpp \ + src/armnn/Observable.cpp \ src/armnn/backends/RefLayerSupport.cpp \ src/armnn/backends/ClLayerSupport.cpp \ src/armnn/backends/NeonLayerSupport.cpp \ src/armnn/backends/NeonWorkloadUtils.cpp \ - src/armnn/backends/NeonWorkloadFactory.cpp + src/armnn/backends/NeonWorkloadFactory.cpp \ + src/armnn/memory/BaseMemoryManager.cpp \ + src/armnn/memory/BlobLifetimeManager.cpp \ + src/armnn/memory/BlobMemoryPool.cpp \ + src/armnn/memory/OffsetLifetimeManager.cpp \ + src/armnn/memory/OffsetMemoryPool.cpp \ + src/armnn/memory/PoolManager.cpp LOCAL_STATIC_LIBRARIES := \ - armnn-arm_compute \ + armnn-arm_compute \ libboost_log \ libboost_system \ libboost_thread @@ -213,9 +244,20 @@ LOCAL_CFLAGS := \ include $(BUILD_STATIC_LIBRARY) +############### +# armnn-tests # +############### include $(CLEAR_VARS) -LOCAL_C_INCLUDES := \ +LOCAL_MODULE := armnn-tests +LOCAL_MODULE_TAGS := eng optional +LOCAL_ARM_MODE := arm +LOCAL_PROPRIETARY_MODULE := true + +# Mark source files as dependent on Android.mk +LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk + +LOCAL_C_INCLUDES := \ $(OPENCL_HEADER_PATH) \ $(NN_HEADER_PATH) \ $(ARMNN_HEADER_PATH) \ @@ -230,14 +272,19 @@ LOCAL_CFLAGS := \ -DARMCOMPUTECL_ENABLED \ -DARMCOMPUTENEON_ENABLED -LOCAL_SRC_FILES := \ +LOCAL_SRC_FILES := \ src/armnn/test/UnitTests.cpp \ src/armnn/test/EndToEndTest.cpp \ src/armnn/test/UtilsTests.cpp \ src/armnn/test/GraphTests.cpp \ src/armnn/test/RuntimeTests.cpp \ src/armnn/test/TensorTest.cpp \ - src/armnn/test/Network_test.cpp \ + src/armnn/test/NeonTimerTest.cpp \ + src/armnn/test/NetworkTests.cpp \ + src/armnn/test/InstrumentTests.cpp \ + src/armnn/test/OpenClTimerTest.cpp \ + src/armnn/test/ProfilingEventTest.cpp \ + src/armnn/test/ObservableTest.cpp \ src/armnn/backends/test/IsLayerSupportedTest.cpp \ src/armnn/backends/test/Reference.cpp \ src/armnn/backends/test/WorkloadDataValidation.cpp \ @@ -259,7 +306,7 @@ LOCAL_STATIC_LIBRARIES := \ libboost_thread \ armnn-arm_compute -LOCAL_SHARED_LIBRARIES := \ +LOCAL_SHARED_LIBRARIES := \ libbase \ libhidlbase \ libhidltransport \ @@ -271,18 +318,5 @@ LOCAL_SHARED_LIBRARIES := \ android.hidl.memory@1.0 \ libOpenCL -LOCAL_MODULE := armnn-tests - -LOCAL_MODULE_TAGS := eng optional - -LOCAL_ARM_MODE := arm - -# Mark source files as dependent on Android.mk -LOCAL_ADDITIONAL_DEPENDENCIES := $(LOCAL_PATH)/Android.mk - -LOCAL_PROPRIETARY_MODULE := true - include $(BUILD_EXECUTABLE) - - diff --git a/BuildGuideAndroidNDK.md b/BuildGuideAndroidNDK.md index 5d6f523632..8b2e2a86ba 100644 --- a/BuildGuideAndroidNDK.md +++ b/BuildGuideAndroidNDK.md @@ -164,8 +164,8 @@ All downloaded or generated files will be saved inside the `~/armnn-devenv` dire CC=aarch64-linux-android-clang \ CXX_FLAGS="-fPIE -fPIC" \ cmake .. \ - -DCMAKE_SYSTEM_NAME=Linux \ - -DCMAKE_EXE_LINKER_FLAGS=-pie \ + -DCMAKE_SYSTEM_NAME=Android \ + -DCMAKE_EXE_LINKER_FLAGS="-pie -llog" \ -DARMCOMPUTE_ROOT=$HOME/armnn-devenv/ComputeLibrary/ \ -DARMCOMPUTE_BUILD_DIR=$HOME/armnn-devenv/ComputeLibrary/build \ -DBOOST_ROOT=$HOME/armnn-devenv/boost/install/ \ @@ -181,11 +181,11 @@ All downloaded or generated files will be saved inside the `~/armnn-devenv` dire * Push the build results to an Android device and make symbolic links for shared libraries: ```bash - adb push libarmnnTfParser.so libarmnn.so UnitTests \ - $NDK/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++_shared.so \ - /data/local/tmp/ - adb push $HOME/armnn-devenv/google/arm64_pb_install/lib/libprotobuf.so \ - /data/local/tmp/libprotobuf.so.15.0.1 + adb push libarmnnTfParser.so /data/local/tmp/ + adb push libarmnn.so /data/local/tmp/ + adb push UnitTests /data/local/tmp/ + adb push $NDK/sources/cxx-stl/llvm-libc++/libs/arm64-v8a/libc++_shared.so /data/local/tmp/ + adb push $HOME/armnn-devenv/google/arm64_pb_install/lib/libprotobuf.so /data/local/tmp/libprotobuf.so.15.0.1 adb shell 'ln -s libprotobuf.so.15.0.1 /data/local/tmp/libprotobuf.so.15' adb shell 'ln -s libprotobuf.so.15.0.1 /data/local/tmp/libprotobuf.so' ``` diff --git a/BuildGuideCrossCompilation.md b/BuildGuideCrossCompilation.md new file mode 100644 index 0000000000..df015a08f4 --- /dev/null +++ b/BuildGuideCrossCompilation.md @@ -0,0 +1,265 @@ +# How to Cross-Compile ArmNN on x86_64 for arm64 + +* [Introduction](#introduction) +* [Build and install Google's Protobuf library](#buildProtobuf) +* [Build Caffe for x86_64](#buildCaffe) +* [Cross-compiling ToolChain](#installCCT) +* [Build Boost library for arm64](#installBaarch) +* [Build Compute Library](#buildCL) +* [Build Compute Library](#buildCL) +* [Build ArmNN](#buildANN) +* [Run Unit Tests](#unittests) +* [Troubleshooting and Errors](#troubleshooting) + + +#### Introduction +These are the step by step instructions on Cross-Compiling ArmNN under an x86_64 system to target an Arm64 system. This build flow has been tested with Ubuntu 16.04. +The instructions show how to build the ArmNN core library and the Boost, Protobuf, Caffe and Compute Libraries necessary for compilation. + +#### Build and install Google's Protobuf library + +* Get protobuf-all-3.5.1.tar.gz from here: https://github.com/google/protobuf/releases +* Extract: + ```bash + tar -zxvf protobuf-all-3.5.1.tar.gz + cd protobuf-3.5.1 + ``` +* Build a native (x86_64) version of the protobuf libraries and compiler (protoc): + (Requires cUrl, autoconf, llibtool, and other build dependencies if not previously installed: sudo apt install curl autoconf libtool build-essential g++) + ``` + mkdir x86_64_build + cd x86_64_build + ../configure --prefix=$HOME/armnn-devenv/google/x86_64_pb_install + make install -j16 + cd .. + ``` +* Build the arm64 version of the protobuf libraries: + ``` + mkdir arm64_build + cd arm64_build + CC=aarch64-linux-gnu-gcc \ + CXX=aarch64-linux-gnu-g++ \ + ../configure --host=aarch64-linux \ + --prefix=$HOME/armnn-devenv/google/arm64_pb_install \ + --with-protoc=$HOME/armnn-devenv/google/x86_64_pb_install/bin/protoc + make install -j16 + cd .. + ``` + +#### Build Caffe for x86_64 +* Ubuntu 16.04 installation. These steps are taken from the full Caffe installation documentation at: http://caffe.berkeleyvision.org/install_apt.html +* Install dependencies: + ```bash + sudo apt-get install libleveldb-dev libsnappy-dev libopencv-dev libhdf5-serial-dev + sudo apt-get install --no-install-recommends libboost-all-dev + sudo apt-get install libgflags-dev libgoogle-glog-dev liblmdb-dev + sudo apt-get install libopenblas-dev + sudo apt-get install libatlas-base-dev + ``` +* Download Caffe-Master from: https://github.com/BVLC/caffe + ```bash + git clone https://github.com/BVLC/caffe.git + cd caffe + cp Makefile.config.example Makefile.config + ``` +* Adjust Makefile.config (for example, if using Anaconda Python, or if cuDNN is desired): + ``` + CPU only version - + CPU_ONLY := 1 + Add hdf5 and protobuf include and library directories (Replace $HOME with your actual /home/username dir) + INCLUDE_DIRS := $(PYTHON_INCLUDE) /usr/local/include /usr/include/hdf5/serial/ $HOME/armnn-devenv/google/x86_64_pb_install/include/ + LIBRARY_DIRS := $(PYTHON_LIB) /usr/local/lib /usr/lib /usr/lib/x86_64-linux-gnu/hdf5/serial/ $HOME/armnn-devenv/google/x86_64_pb_install/lib/ + g++ need to be version 5 + CUSTOM_CXX := g++-5 + ``` +* Setup environment: + ```bash + export PATH=$HOME/armnn-devenv/google/x86_64_pb_install/bin/:$PATH + export LD_LIBRARY_PATH=$HOME/armnn-devenv/google/x86_64_pb_install/lib/:LD_LIBRARY_PATH + ``` +* Compilation with Make: + ```bash + make all + make test + make runtest + ``` + These should all run without errors +* caffe.pb.h and caffe.pb.cc will be needed when building ArmNN's Caffe Parser + +#### Cross-compiling ToolChain +* Install the standard cross-compilation libraries for arm64: + ``` + sudo apt install crossbuild-essential-arm64 + ``` +#### Build Boost library for arm64 +* Build Boost library for arm64 + Download Boost version 1.64 from http://www.boost.org/doc/libs/1_64_0/more/getting_started/unix-variants.html + Version 1.66 is not supported. + ```bash + tar -zxvf boost_1_64_0.tar.gz + cd boost_1_64_0 + echo "using gcc : arm : aarch64-linux-gnu-g++ ;" > user_config.jam + ./bootstrap.sh --prefix=$HOME/armnn-devenv/boost_arm64_install + ./b2 install toolset=gcc-arm link=static cxxflags=-fPIC --with-filesystem --with-test --with-log --with-program_options -j32 --user-config=user_config.jam + ``` + +#### Build Compute Library +* Building the Arm Compute Library: + ```bash + git clone https://github.com/ARM-software/ComputeLibrary.git + cd ComputeLibrary/ + scons arch=arm64-v8a neon=1 opencl=1 embed_kernels=1 extra_cxx_flags="-fPIC" -j8 internal_only=0 + ``` + +#### Build ArmNN +* Compile ArmNN for arm64: + ```bash + git clone https://github.com/ARM-software/armnn.git + cd armnn + mkdir build + cd build + ``` + +* Use CMake to configure your build environment, update the following script and run it from the armnn/build directory to set up the armNN build: + ```bash + #!/bin/bash + CXX=aarch64-linux-gnu-g++ \ + CC=aarch64-linux-gnu-gcc \ + cmake .. \ + -DARMCOMPUTE_ROOT=$HOME/armnn-devenv/ComputeLibrary \ + -DARMCOMPUTE_BUILD_DIR=$HOME/armnn-devenv/ComputeLibrary/build/ \ + -DBOOST_ROOT=$HOME/armnn-devenv/boost_arm64_install/ \ + -DARMCOMPUTENEON=1 -DARMCOMPUTECL=1 \ + -DCAFFE_GENERATED_SOURCES=$HOME/armnn-devenv/caffe/build/src \ + -DBUILD_CAFFE_PARSER=1 \ + -DPROTOBUF_ROOT=$HOME/armnn-devenv/google/x86_64_pb_install/ \ + -DPROTOBUF_LIBRARY_DEBUG=$HOME/armnn-devenv/google/arm64_pb_install/lib/libprotobuf.so.15.0.1 \ + -DPROTOBUF_LIBRARY_RELEASE=$HOME/armnn-devenv/google/arm64_pb_install/lib/libprotobuf.so.15.0.1 + ``` +* Run the build + ```bash + make -j32 + ``` + +#### Run Unit Tests +* Copy the build folder to an arm64 linux machine +* Copy the libprotobuf.so.15.0.1 library file to the build folder +* cd to the build folder on your arm64 machine and set your LD_LIBRARY_PATH to its current location: + ``` + cd build/ + export LD_LIBRARY_PATH=`pwd` + ``` +* Run the UnitTests: + ``` + ./UnitTests + Running 567 test cases... + + *** No errors detected + ``` +#### Troubleshooting and Errors: +#### Error adding symbols: File in wrong format +* When building armNN: + ``` + /usr/local/lib/libboost_log.a: error adding symbols: File in wrong format + collect2: error: ld returned 1 exit status + CMakeFiles/armnn.dir/build.make:4028: recipe for target 'libarmnn.so' failed + make[2]: *** [libarmnn.so] Error 1 + CMakeFiles/Makefile2:105: recipe for target 'CMakeFiles/armnn.dir/all' failed + make[1]: *** [CMakeFiles/armnn.dir/all] Error 2 + Makefile:127: recipe for target 'all' failed + make: *** [all] Error 2 + ``` +* Boost libraries are not compiled for the correct architecture, try recompiling for arm64 +## +#### Virtual memory exhausted +* When compiling the boost libraries: + ```bash + virtual memory exhausted: Cannot allocate memory + ``` +* Not enough memory available to compile. Increase the amount of RAM or swap space available. + +## +#### Unrecognized command line option '-m64' +* When compiling the boost libraries: + ```bash + aarch64-linux-gnu-g++: error: unrecognized command line option ‘-m64’ + ``` +* Clean the boost library directory before trying to build with a different architecture: + ```bash + sudo ./b2 clean + ``` +* It should show the following for arm64: + ```bash + - 32-bit : no + - 64-bit : yes + - arm : yes + ``` + +## +#### Missing libz.so.1 +* When compiling armNN: + ```bash + /usr/lib/gcc-cross/aarch64-linux-gnu/5/../../../../aarch64-linux-gnu/bin/ld: warning: libz.so.1, needed by /home//armNN/usr/lib64/libprotobuf.so.15.0.0, not found (try using -rpath or -rpath-link) + ``` + +* Missing arm64 libraries for libz.so.1, these can be added by adding a second architecture to dpkg and explicitely installing them: + ```bash + sudo dpkg --add-architecture arm64 + sudo apt-get install zlib1g:arm64 + sudo apt-get update + sudo ldconfig + ``` +* If apt-get update returns 404 errors for arm64 repos refer to section 5 below. +* Alternatively the missing arm64 version of libz.so.1 can be downloaded and installed from a .deb package here: + https://launchpad.net/ubuntu/wily/arm64/zlib1g/1:1.2.8.dfsg-2ubuntu4 + ```bash + sudo dpkg -i zlib1g_1.2.8.dfsg-2ubuntu4_arm64.deb + ``` +## +#### Unable to install arm64 packages after adding arm64 architecture +* Using sudo apt-get update should add all of the required repos for arm64 but if it does not or you are getting 404 errors the following instructions can be used to add the repos manually: +* From stackoverflow: +https://askubuntu.com/questions/430705/how-to-use-apt-get-to-download-multi-arch-library/430718 +* Open /etc/apt/sources.list with your preferred text editor. + +* Mark all the current (default) repos as \[arch=], e.g. + ```bash + deb [arch=amd64] http://archive.ubuntu.com/ubuntu/ xenial main restricted + ``` +* Then add the following: + ```bash + deb [arch=arm64] http://ports.ubuntu.com/ xenial main restricted + deb [arch=arm64] http://ports.ubuntu.com/ xenial-updates main restricted + deb [arch=arm64] http://ports.ubuntu.com/ xenial universe + deb [arch=arm64] http://ports.ubuntu.com/ xenial-updates universe + deb [arch=arm64] http://ports.ubuntu.com/ xenial multiverse + deb [arch=arm64] http://ports.ubuntu.com/ xenial-updates multiverse + deb [arch=arm64] http://ports.ubuntu.com/ xenial-backports main restricted universe multiverse + ``` +* Update and install again: + ```bash + sudo apt-get install zlib1g:arm64 + sudo apt-get update + sudo ldconfig + ``` +## +#### Undefined references to google::protobuf:: functions +* When compiling armNN there are multiple errors of the following type: + ``` + libarmnnCaffeParser.so: undefined reference to `google::protobuf:* + ``` +* Missing or out of date protobuf compilation libraries. + Use the command 'protoc --version' to check which version of protobuf is available (version 3.5.1 is required). + Follow the instructions above to install protobuf 3.5.1 + Note this will require you to recompile Caffe for x86_64 + +## +#### Errors on strict-aliasing rules when compiling the Compute Library +* When compiling the Compute Library there are multiple errors on strict-aliasing rules: + ``` + cc1plus: error: unrecognized command line option ‘-Wno-implicit-fallthrough’ [-Werror] + ``` +* Add Werror=0 to the scons command: + ``` + scons arch=arm64-v8a neon=1 opencl=1 embed_kernels=1 extra_cxx_flags="-fPIC" -j8 internal_only=0 Werror=0 + ``` \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index f40a21c10a..c06a869af5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,8 @@ if (BUILD_TESTS) add_subdirectory(tests) endif() +add_subdirectory(samples) + # Include the additional cmake files in their own target so that they will appear nicely in IDEs add_custom_target(AdditionalCMakeFiles SOURCES ${additional_cmake_files}) @@ -31,6 +33,12 @@ list(APPEND armnnUtils_sources src/armnnUtils/HeapProfiling.hpp src/armnnUtils/LeakChecking.cpp src/armnnUtils/LeakChecking.hpp + src/armnnUtils/CsvReader.cpp + src/armnnUtils/CsvReader.hpp + src/armnnUtils/FloatingPointConverter.cpp + src/armnnUtils/FloatingPointConverter.hpp + src/armnnUtils/VerificationHelpers.hpp + src/armnnUtils/VerificationHelpers.cpp ) if(BUILD_TF_PARSER OR BUILD_CAFFE_PARSER) list(APPEND armnnUtils_sources @@ -45,6 +53,8 @@ if(BUILD_CAFFE_PARSER) set(armnn_caffe_parser_sources) list(APPEND armnn_caffe_parser_sources include/armnnCaffeParser/ICaffeParser.hpp + src/armnnCaffeParser/RecordByRecordCaffeParser.hpp + src/armnnCaffeParser/RecordByRecordCaffeParser.cpp src/armnnCaffeParser/CaffeParser.hpp src/armnnCaffeParser/CaffeParser.cpp ${CAFFE_GENERATED_SOURCES}/caffe/proto/caffe.pb.cc @@ -63,6 +73,30 @@ if(BUILD_CAFFE_PARSER) target_link_libraries(armnnCaffeParser armnn) target_link_libraries(armnnCaffeParser ${PROTOBUF_LIBRARIES}) + +endif() + +if(BUILD_ONNX_PARSER) + set(armnn_onnx_parser_sources) + list(APPEND armnn_onnx_parser_sources + include/armnnOnnxParser/IOnnxParser.hpp + src/armnnOnnxParser/OnnxParser.hpp + src/armnnOnnxParser/OnnxParser.cpp + ${ONNX_GENERATED_SOURCES}/onnx/onnx.pb.cc + ) + # The generated onnx protobuf .cc files are not warning clean and we can't fix them. + if(COMPILER_IS_GNU_LIKE) + set_source_files_properties(${ONNX_GENERATED_SOURCES}/onnx/onnx.pb.cc PROPERTIES COMPILE_FLAGS "-Wno-conversion -Wno-sign-conversion") + endif() + + add_library_ex(armnnOnnxParser SHARED ${armnn_onnx_parser_sources}) + + target_include_directories(armnnOnnxParser PRIVATE src/armnnUtils) + + target_link_libraries(armnnOnnxParser armnn) + + # Protobuf + target_link_libraries(armnnOnnxParser ${PROTOBUF_LIBRARIES}) endif() if(BUILD_TF_PARSER) @@ -88,7 +122,25 @@ if(BUILD_TF_PARSER) target_link_libraries(armnnTfParser ${PROTOBUF_LIBRARIES}) endif() +if(BUILD_TF_LITE_PARSER) + set(armnn_tf_lite_parser_sources) + list(APPEND armnn_tf_lite_parser_sources + include/armnnTfLiteParser/ITfLiteParser.hpp + src/armnnTfLiteParser/TfLiteParser.hpp + src/armnnTfLiteParser/TfLiteParser.cpp + ) + + add_library_ex(armnnTfLiteParser SHARED ${armnn_tf_lite_parser_sources}) + + target_include_directories(armnnTfLiteParser PRIVATE src/armnnUtils) + + target_link_libraries(armnnTfLiteParser ${Boost_FILESYSTEM_LIBRARY} ${Boost_THREAD_LIBRARY}) + target_link_libraries(armnnTfLiteParser armnn ${FLATBUFFERS_LIBRARY}) +endif() + # ArmNN source files required for all build options +include_directories(SYSTEM third-party) + list(APPEND armnn_sources include/armnn/ArmNN.hpp include/armnn/Descriptors.hpp @@ -126,9 +178,8 @@ list(APPEND armnn_sources src/armnn/backends/WorkloadData.cpp src/armnn/backends/WorkloadFactory.hpp src/armnn/backends/WorkloadFactory.cpp - src/armnn/backends/AclBaseMemoryManager.hpp - src/armnn/backends/AclBaseMemoryManager.cpp src/armnn/backends/WorkloadInfo.hpp + src/armnn/backends/WorkloadUtils.hpp src/armnn/backends/MemCopyWorkload.cpp src/armnn/backends/MemCopyWorkload.hpp src/armnn/backends/RefWorkloads/Broadcast.hpp @@ -222,6 +273,12 @@ list(APPEND armnn_sources src/armnn/backends/RefWorkloads/RefFakeQuantizationFloat32Workload.hpp src/armnn/backends/RefWorkloads/RefPermuteWorkload.hpp src/armnn/backends/RefWorkloads/RefPermuteWorkload.cpp + src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.cpp + src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.hpp + src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.cpp + src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.hpp + src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.cpp + src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.hpp src/armnn/layers/LayerCloneBase.hpp src/armnn/layers/LayerWithParameters.hpp src/armnn/layers/ActivationLayer.hpp @@ -234,6 +291,10 @@ list(APPEND armnn_sources src/armnn/layers/ConstantLayer.cpp src/armnn/layers/Convolution2dLayer.hpp src/armnn/layers/Convolution2dLayer.cpp + src/armnn/layers/ConvertFp16ToFp32Layer.hpp + src/armnn/layers/ConvertFp16ToFp32Layer.cpp + src/armnn/layers/ConvertFp32ToFp16Layer.hpp + src/armnn/layers/ConvertFp32ToFp16Layer.cpp src/armnn/layers/DepthwiseConvolution2dLayer.hpp src/armnn/layers/DepthwiseConvolution2dLayer.cpp src/armnn/layers/FakeQuantizationLayer.hpp @@ -246,6 +307,8 @@ list(APPEND armnn_sources src/armnn/layers/InputLayer.cpp src/armnn/layers/L2NormalizationLayer.hpp src/armnn/layers/L2NormalizationLayer.cpp + src/armnn/layers/LstmLayer.cpp + src/armnn/layers/LstmLayer.hpp src/armnn/layers/MemCopyLayer.hpp src/armnn/layers/MemCopyLayer.cpp src/armnn/layers/MergerLayer.hpp @@ -268,8 +331,11 @@ list(APPEND armnn_sources src/armnn/layers/SoftmaxLayer.cpp src/armnn/layers/SplitterLayer.hpp src/armnn/layers/SplitterLayer.cpp + src/armnn/Half.hpp src/armnn/InternalTypes.hpp src/armnn/InternalTypes.cpp + src/armnn/JsonPrinter.hpp + src/armnn/JsonPrinter.cpp src/armnn/LayerFwd.hpp src/armnn/Layer.hpp src/armnn/Layer.cpp @@ -279,6 +345,7 @@ list(APPEND armnn_sources src/armnn/SerializeLayerParameters.cpp src/armnn/SerializeLayerParameters.hpp src/armnn/Descriptors.cpp + src/armnn/DeviceSpec.hpp src/armnn/LoadedNetwork.hpp src/armnn/LoadedNetwork.cpp src/armnn/Exceptions.cpp @@ -286,22 +353,35 @@ list(APPEND armnn_sources src/armnn/Graph.cpp src/armnn/Network.hpp src/armnn/Network.cpp + src/armnn/NetworkUtils.hpp src/armnn/backends/OutputHandler.hpp src/armnn/backends/OutputHandler.cpp + src/armnn/ProfilingEvent.cpp + src/armnn/ProfilingEvent.hpp src/armnn/Profiling.cpp + src/armnn/Instrument.hpp + src/armnn/WallClockTimer.hpp + src/armnn/WallClockTimer.cpp src/armnn/Tensor.cpp src/armnn/Utils.cpp src/armnn/LayerSupport.cpp src/armnn/LayerSupportCommon.hpp src/armnn/optimizations/All.hpp + src/armnn/optimizations/ConvertConstants.hpp src/armnn/optimizations/MovePermuteUp.hpp src/armnn/optimizations/Optimization.hpp src/armnn/optimizations/OptimizeConsecutiveReshapes.hpp src/armnn/optimizations/OptimizeInversePermutes.hpp src/armnn/optimizations/PermuteAsReshape.hpp src/armnn/optimizations/SquashEqualSiblings.hpp + src/armnn/optimizations/OptimizeInverseConversions.hpp + src/armnn/optimizations/ConvertFp32NetworkToFp16.hpp src/armnn/Optimizer.hpp src/armnn/Optimizer.cpp + third-party/half/half.hpp + src/armnn/IGraphObservable.hpp + src/armnn/Observable.hpp + src/armnn/Observable.cpp ) if(ARMCOMPUTENEON) @@ -322,12 +402,18 @@ if(ARMCOMPUTENEON) src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.hpp src/armnn/backends/NeonWorkloads/NeonConstantUint8Workload.cpp src/armnn/backends/NeonWorkloads/NeonConstantUint8Workload.hpp + src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.cpp + src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.hpp + src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.cpp + src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.hpp src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.cpp src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.hpp src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.cpp src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.hpp src/armnn/backends/NeonWorkloads/NeonConvolution2dUint8Workload.cpp src/armnn/backends/NeonWorkloads/NeonConvolution2dUint8Workload.hpp + src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.cpp + src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.hpp src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.cpp src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.hpp src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.cpp @@ -338,6 +424,8 @@ if(ARMCOMPUTENEON) src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.cpp src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp + src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.cpp + src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.hpp src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.cpp src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.hpp src/armnn/backends/NeonWorkloads/NeonMergerUint8Workload.cpp @@ -358,6 +446,8 @@ if(ARMCOMPUTENEON) src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.hpp src/armnn/backends/NeonWorkloads/NeonReshapeUint8Workload.cpp src/armnn/backends/NeonWorkloads/NeonReshapeUint8Workload.hpp + src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.cpp + src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.hpp src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.cpp src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.hpp src/armnn/backends/NeonWorkloads/NeonSoftmaxUint8Workload.cpp @@ -368,7 +458,11 @@ if(ARMCOMPUTENEON) src/armnn/backends/NeonWorkloads/NeonSplitterUint8Workload.hpp src/armnn/backends/NeonWorkloadUtils.cpp src/armnn/backends/NeonWorkloadUtils.hpp - src/armnn/backends/NeonTensorHandle.hpp) + src/armnn/backends/NeonTensorHandle.hpp + src/armnn/NeonInterceptorScheduler.hpp + src/armnn/NeonInterceptorScheduler.cpp + src/armnn/NeonTimer.hpp + src/armnn/NeonTimer.cpp) endif() if(ARMCOMPUTECL) # Additionally include source files for ARM Compute OpenCL backend @@ -377,8 +471,16 @@ if(ARMCOMPUTECL) src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.hpp src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp src/armnn/backends/ClWorkloads/ClActivationUint8Workload.hpp + src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.cpp + src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.hpp + src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.cpp + src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.hpp + src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.cpp + src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.hpp src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.hpp + src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.cpp + src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.hpp src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.hpp src/armnn/backends/ClWorkloads/ClBaseMergerWorkload.hpp @@ -394,17 +496,20 @@ if(ARMCOMPUTECL) src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.hpp src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.hpp + src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.cpp + src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp - src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionHelper.hpp src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.hpp src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp + src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.cpp + src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.hpp src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.hpp src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp @@ -427,6 +532,8 @@ if(ARMCOMPUTECL) src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.hpp src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.hpp + src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.cpp + src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.hpp src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp @@ -436,14 +543,29 @@ if(ARMCOMPUTECL) src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.cpp src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.hpp src/armnn/backends/ClWorkloadUtils.hpp - src/armnn/backends/ClTensorHandle.hpp) + src/armnn/backends/ClTensorHandle.hpp + src/armnn/OpenClTimer.cpp + src/armnn/OpenClTimer.hpp) endif() # Files shared by all ARM Compute backends if(ARMCOMPUTENEON OR ARMCOMPUTECL) list(APPEND armnn_sources src/armnn/backends/ArmComputeTensorUtils.hpp src/armnn/backends/ArmComputeTensorUtils.cpp - src/armnn/backends/ArmComputeUtils.hpp) + src/armnn/backends/ArmComputeUtils.hpp + src/armnn/memory/IMemoryPool.hpp + src/armnn/memory/BlobMemoryPool.cpp + src/armnn/memory/BlobMemoryPool.hpp + src/armnn/memory/BlobLifetimeManager.cpp + src/armnn/memory/BlobLifetimeManager.hpp + src/armnn/memory/PoolManager.cpp + src/armnn/memory/PoolManager.hpp + src/armnn/memory/BaseMemoryManager.hpp + src/armnn/memory/BaseMemoryManager.cpp + src/armnn/memory/OffsetMemoryPool.cpp + src/armnn/memory/OffsetMemoryPool.hpp + src/armnn/memory/OffsetLifetimeManager.cpp + src/armnn/memory/OffsetLifetimeManager.hpp) endif() # Files used for Streamline-based profiling backend @@ -459,13 +581,20 @@ target_include_directories(armnn PRIVATE src/armnnUtils) target_link_libraries(armnn armnnUtils) target_link_libraries(armnn ${CMAKE_DL_LIBS}) + install(TARGETS armnn DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) if(BUILD_CAFFE_PARSER) install(TARGETS armnnCaffeParser DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) endif() +if(BUILD_ONNX_PARSER) + install(TARGETS armnnOnnxParser DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) +endif() if(BUILD_TF_PARSER) install(TARGETS armnnTfParser DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) endif() +if(BUILD_TF_LITE_PARSER) + install(TARGETS armnnTfLiteParser DESTINATION ${CMAKE_INSTALL_PREFIX}/lib) +endif() install(DIRECTORY include/ DESTINATION ${CMAKE_INSTALL_PREFIX}/include) target_link_libraries(armnn ${Boost_LOG_LIBRARY} ${Boost_THREAD_LIBRARY} ${Boost_SYSTEM_LIBRARY}) @@ -488,14 +617,21 @@ if(BUILD_UNIT_TESTS) src/armnn/test/UnitTests.hpp src/armnn/test/EndToEndTest.cpp src/armnn/test/UtilsTests.cpp + src/armnn/test/JsonPrinterTests.cpp src/armnn/test/GraphTests.cpp src/armnn/test/OptimizerTests.cpp + src/armnn/test/ProfilerTests.cpp src/armnn/test/RuntimeTests.cpp src/armnn/test/CreateWorkload.hpp src/armnn/test/TensorTest.cpp src/armnn/test/TensorHelpers.hpp - src/armnn/test/Network_test.cpp + src/armnn/test/CsvReaderTest.cpp + src/armnn/test/NetworkTests.cpp + src/armnn/test/FloatingPointConverterTest.cpp + src/armnn/test/ProfilingEventTest.cpp src/armnn/test/GraphUtils.hpp + src/armnn/test/InstrumentTests.cpp + src/armnn/test/ObservableTest.cpp src/armnn/backends/test/IsLayerSupportedTest.cpp src/armnn/backends/test/IsLayerSupportedTestImpl.hpp src/armnn/backends/test/Reference.cpp @@ -504,6 +640,7 @@ if(BUILD_UNIT_TESTS) src/armnn/backends/test/TensorCopyUtils.cpp src/armnn/backends/test/LayerTests.hpp src/armnn/backends/test/LayerTests.cpp + src/armnn/backends/test/LayerReleaseConstantDataTest.cpp src/armnn/backends/test/Conv2dTestImpl.hpp src/armnn/backends/test/ActivationTestImpl.hpp src/armnn/backends/test/ActivationFixture.hpp @@ -522,14 +659,18 @@ if(BUILD_UNIT_TESTS) list(APPEND unittest_sources src/armnn/backends/test/ArmComputeNeon.cpp src/armnn/backends/test/CreateWorkloadNeon.cpp - src/armnn/test/CreateWorkloadClNeon.hpp) + src/armnn/test/CreateWorkloadClNeon.hpp + src/armnn/test/NeonTimerTest.cpp) endif() if(ARMCOMPUTECL) list(APPEND unittest_sources src/armnn/backends/test/ArmComputeCl.cpp + src/armnn/backends/test/ClContextControlFixture.hpp src/armnn/backends/test/CreateWorkloadCl.cpp - src/armnn/test/CreateWorkloadClNeon.hpp) + src/armnn/test/CreateWorkloadClNeon.hpp + src/armnn/test/OpenClTimerTest.cpp + src/armnn/test/FP16SupportTest.cpp) endif() if(ARMCOMPUTENEON OR ARMCOMPUTECL) @@ -550,6 +691,7 @@ if(BUILD_UNIT_TESTS) src/armnnTfParser/test/FusedBatchNorm.cpp src/armnnTfParser/test/Identity.cpp src/armnnTfParser/test/LocalResponseNormalization.cpp + src/armnnTfParser/test/MaximumForLeakyRelu.cpp src/armnnTfParser/test/Multiplication.cpp src/armnnTfParser/test/MultiOutput.cpp src/armnnTfParser/test/PassThru.cpp @@ -565,10 +707,29 @@ if(BUILD_UNIT_TESTS) src/armnnTfParser/test/Squeeze.cpp) endif() + if(BUILD_TF_LITE_PARSER) + list(APPEND unittest_sources + src/armnnTfLiteParser/test/ParserFlatbuffersFixture.hpp + src/armnnTfLiteParser/test/AvgPool2D.cpp + src/armnnTfLiteParser/test/Conv2D.cpp + src/armnnTfLiteParser/test/DepthwiseConvolution2D.cpp + src/armnnTfLiteParser/test/Softmax.cpp + src/armnnTfLiteParser/test/Squeeze.cpp + src/armnnTfLiteParser/test/LoadModel.cpp + src/armnnTfLiteParser/test/GetBuffer.cpp + src/armnnTfLiteParser/test/OutputShapeOfSqueeze.cpp + src/armnnTfLiteParser/test/InputOutputTensorNames.cpp + src/armnnTfLiteParser/test/GetTensorIds.cpp + src/armnnTfLiteParser/test/GetSubgraphInputsOutputs.cpp + src/armnnTfLiteParser/test/GetInputsOutputs.cpp + ) + endif() + if(BUILD_CAFFE_PARSER) list(APPEND unittest_sources src/armnnCaffeParser/test/TestAdd.cpp src/armnnCaffeParser/test/TestConcat.cpp + src/armnnCaffeParser/test/TestConvolution.cpp src/armnnCaffeParser/test/TestDropout.cpp src/armnnCaffeParser/test/TestInputs.cpp src/armnnCaffeParser/test/TestMul.cpp @@ -579,19 +740,41 @@ if(BUILD_UNIT_TESTS) ) endif() + if(BUILD_ONNX_PARSER) + list(APPEND unittest_sources + src/armnnOnnxParser/test/Constructor.cpp + src/armnnOnnxParser/test/CreateNetwork.cpp + src/armnnOnnxParser/test/ProtoxtFixture.cpp + src/armnnOnnxParser/test/Const.cpp + src/armnnOnnxParser/test/Pooling.cpp + src/armnnOnnxParser/test/Reshape.cpp + src/armnnOnnxParser/test/Relu.cpp + src/armnnOnnxParser/test/Conv2D.cpp + src/armnnOnnxParser/test/Addition.cpp + src/armnnOnnxParser/test/FullyConnected.cpp + src/armnnOnnxParser/test/GetInputsOutputs.cpp + src/armnnOnnxParser/test/BatchNorm.cpp + src/armnnOnnxParser/test/DepthConv.cpp + ) + endif() + add_executable_ex(UnitTests ${unittest_sources}) target_include_directories(UnitTests PRIVATE src/armnn) target_include_directories(UnitTests PRIVATE src/armnnUtils) - if(NOT HEAP_PROFILING AND VALGRIND_FOUND) - # Valgrind works with gperftools version number <= 2.4 - target_compile_definitions(UnitTests PRIVATE "WITH_VALGRIND=1") + if(VALGRIND_FOUND) + if(HEAP_PROFILING OR LEAK_CHECKING) + message("Valgrind is disabled for heap profiling and leak checking builds.") + else() + # Valgrind works with gperftools version number <= 2.4 + target_compile_definitions(UnitTests PRIVATE "WITH_VALGRIND=1") + endif() endif() target_link_libraries(UnitTests armnn) target_link_libraries(UnitTests armnnUtils) target_link_libraries(UnitTests ${CMAKE_THREAD_LIBS_INIT}) - target_link_libraries(UnitTests ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY}) + target_link_libraries(UnitTests ${Boost_UNIT_TEST_FRAMEWORK_LIBRARY} ${Boost_SYSTEM_LIBRARY} ${Boost_FILESYSTEM_LIBRARY}) if(BUILD_TF_PARSER) target_link_libraries(UnitTests armnnTfParser) @@ -601,6 +784,13 @@ if(BUILD_UNIT_TESTS) target_link_libraries(UnitTests armnnCaffeParser) endif() + if(BUILD_TF_LITE_PARSER) + target_link_libraries(UnitTests armnnTfLiteParser) + endif() + + if(BUILD_ONNX_PARSER) + target_link_libraries(UnitTests armnnOnnxParser) + endif() + addDllCopyCommands(UnitTests) endif() - diff --git a/README.md b/README.md index e451cb1754..72f5a1faea 100644 --- a/README.md +++ b/README.md @@ -4,14 +4,24 @@ For more information about Arm NN, see: +There is a getting started guide here using TensorFlow Lite: [TensorFlow Lite Support](src/armnnTfLiteParser/README.md) + There is a getting started guide here using Caffe: +There is a getting started guide here using ONNX: [ONNX Support](src/armnnOnnxParser/README.md) + ### Build Instructions Arm tests the build system of Arm NN with the following build environments: * Android NDK: [How to use Android NDK to build ArmNN](BuildGuideAndroidNDK.md) -* Cross compilation from x86_64 Ubuntu to arm64 Linux +* Cross compilation from x86_64 Ubuntu to arm64 Linux: [ArmNN Cross Compilation](BuildGuideCrossCompilation.md) * Native compilation under arm64 Debian 9 Arm NN is written using portable C++14 and the build system uses [CMake](https://cmake.org/) so it is possible to build for a wide variety of target platforms, from a wide variety of host environments. + +The armnn/tests directory contains tests used during ArmNN development. Many of them depend on third-party IP, model protobufs and image files not distributed with ArmNN. The dependencies of some of the tests are available freely on the Internet, for those who wish to experiment. + +The 'ExecuteNetwork' program, in armnn/tests/ExecuteNetwork, has no additional dependencies beyond those required by ArmNN and the model parsers. It takes any model and any input tensor, and simply prints out the output tensor. Run with no arguments to see command-line help. + +The 'armnn/samples' directory contains SimpleSample.cpp. A very basic example of the ArmNN SDK API in use. \ No newline at end of file diff --git a/cmake/GlobalConfig.cmake b/cmake/GlobalConfig.cmake index 2dbeadaadf..47bdd5ca32 100644 --- a/cmake/GlobalConfig.cmake +++ b/cmake/GlobalConfig.cmake @@ -1,15 +1,20 @@ option(BUILD_CAFFE_PARSER "Build Caffe parser" OFF) option(BUILD_TF_PARSER "Build Tensorflow parser" OFF) +option(BUILD_ONNX_PARSER "Build Onnx parser" OFF) option(BUILD_UNIT_TESTS "Build unit tests" ON) option(BUILD_TESTS "Build test applications" OFF) option(BUILD_FOR_COVERAGE "Use no optimization and output .gcno and .gcda files" OFF) option(ARMCOMPUTENEON "Build with ARM Compute NEON support" OFF) option(ARMCOMPUTECL "Build with ARM Compute OpenCL support" OFF) -option(PROFILING "Build with ArmNN built-in profiling support" OFF) option(PROFILING_BACKEND_STREAMLINE "Forward the armNN profiling events to DS-5/Streamline as annotations" OFF) -# options used for heap profiling +# options used for heap profiling and leak checking option(HEAP_PROFILING "Build with heap profiling enabled" OFF) +option(LEAK_CHECKING "Build with leak checking enabled" OFF) option(GPERFTOOLS_ROOT "Location where the gperftools 'include' and 'lib' folders to be found" Off) +# options used for tensorflow lite support +option(BUILD_TF_LITE_PARSER "Build Tensorflow Lite parser" OFF) +option(TF_LITE_GENERATED_PATH "Tensorflow lite generated C++ schema location" OFF) +option(FLATBUFFERS_ROOT "Location where the flatbuffers 'include' and 'lib' folders to be found" Off) include(SelectLibraryConfigurations) @@ -106,7 +111,7 @@ link_directories(${Boost_LIBRARY_DIR}) find_package (Threads) # Favour the protobuf passed on command line -if(BUILD_TF_PARSER OR BUILD_CAFFE_PARSER) +if(BUILD_TF_PARSER OR BUILD_CAFFE_PARSER OR BUILD_ONNX_PARSER) find_library(PROTOBUF_LIBRARY_DEBUG NAMES "protobufd" PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH NO_CMAKE_FIND_ROOT_PATH) @@ -149,6 +154,63 @@ if(BUILD_TF_PARSER) include_directories(SYSTEM "${TF_GENERATED_SOURCES}") endif() +if(BUILD_ONNX_PARSER) + add_definitions(-DARMNN_ONNX_PARSER) + + find_path(ONNX_GENERATED_SOURCES "onnx/onnx.pb.cc") + + # C++ headers generated for onnx protobufs + include_directories(SYSTEM "${ONNX_GENERATED_SOURCES}") +endif() + + +# Flatbuffers support for TF Lite +if(BUILD_TF_LITE_PARSER) + find_path(TF_LITE_SCHEMA_INCLUDE_PATH + schema_generated.h + HINTS ${TF_LITE_GENERATED_PATH}) + + if(NOT TF_LITE_SCHEMA_INCLUDE_PATH) + message(WARNING + "Couldn't find 'schema_generated.h' at ${TF_LITE_GENERATED_PATH}. Disabling Tf Lite support") + set(BUILD_TF_LITE_PARSER Off) + else() + message(STATUS "Tf Lite generated header found at: ${TF_LITE_SCHEMA_INCLUDE_PATH}") + endif() + + # verify we have a valid flatbuffers include path + find_path(FLATBUFFERS_INCLUDE_PATH flatbuffers/flatbuffers.h + HINTS ${FLATBUFFERS_ROOT}/include /usr/local/include /usr/include) + + if(NOT FLATBUFFERS_INCLUDE_PATH) + message(WARNING + "Couldn't find 'flatbuffers/flatbuffers.h' at ${FLATBUFFERS_ROOT}/include. Disabling Tf Lite support") + set(BUILD_TF_LITE_PARSER Off) + else() + message(STATUS "Flatbuffers headers are located at: ${FLATBUFFERS_INCLUDE_PATH}") + endif() + + find_library(FLATBUFFERS_LIBRARY + NAMES libflatbuffers.a flatbuffers + HINTS ${FLATBUFFERS_ROOT}/lib /usr/local/lib /usr/lib) + + if(NOT FLATBUFFERS_LIBRARY) + message(WARNING + "Couldn't find flatbuffers library. Disabling Tf Lite support") + set(BUILD_TF_LITE_PARSER Off) + else() + message(STATUS "Flatbuffers library located at: ${FLATBUFFERS_LIBRARY}") + endif() + + # Setup includes and libs only if we still want Tf Lite + if(BUILD_TF_LITE_PARSER) + include_directories(SYSTEM "${TF_LITE_SCHEMA_INCLUDE_PATH}") + include_directories(SYSTEM "${FLATBUFFERS_INCLUDE_PATH}") + add_definitions(-DARMNN_TF_LITE_PARSER) + add_definitions(-DARMNN_TF_LITE_SCHEMA_PATH="${TF_LITE_SCHEMA_INCLUDE_PATH}/schema.fbs") + endif() +endif() + include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) # ARM Compute @@ -238,12 +300,7 @@ if(ARMCOMPUTENEON OR ARMCOMPUTECL) find_path(HALF_INCLUDE half/half.hpp PATHS ${ARMCOMPUTE_ROOT}/include NO_DEFAULT_PATH NO_CMAKE_FIND_ROOT_PATH) - include_directories(${HALF_INCLUDE}) -endif() - -# Built-in profiler -if(PROFILING) - add_definitions(-DARMNN_PROFILING_ENABLED) + include_directories(SYSTEM ${HALF_INCLUDE}) endif() # Streamline annotate @@ -252,7 +309,7 @@ if(PROFILING_BACKEND_STREAMLINE) add_definitions(-DARMNN_STREAMLINE_ENABLED) endif() -if(HEAP_PROFILING) +if(HEAP_PROFILING OR LEAK_CHECKING) # enable heap profiling for everything except for referencetests if(NOT ${PROJECT_NAME} STREQUAL "referencetests") find_path(HEAP_PROFILER_INCLUDE gperftools/heap-profiler.h @@ -265,9 +322,14 @@ if(HEAP_PROFILING) link_directories(${GPERFTOOLS_ROOT}/lib) link_libraries(${GPERF_TOOLS_LIBRARY}) - add_definitions("-DARMNN_HEAP_PROFILING_ENABLED=1") + if (HEAP_PROFILING) + add_definitions("-DARMNN_HEAP_PROFILING_ENABLED=1") + endif() + if (LEAK_CHECKING) + add_definitions("-DARMNN_LEAK_CHECKING_ENABLED=1") + endif() else() - message("Heap profiling is disabled for referencetests") + message("Heap profiling and leak checking are disabled for referencetests") endif() else() # Valgrind only works with gperftools version number <= 2.4 @@ -283,3 +345,6 @@ if(NOT BUILD_TF_PARSER) message(STATUS "Tensorflow parser support is disabled") endif() +if(NOT BUILD_TF_LITE_PARSER) + message(STATUS "Tensorflow Lite parser support is disabled") +endif() diff --git a/include/armnn/ArmNN.hpp b/include/armnn/ArmNN.hpp index d1cb7a8488..66697c428b 100644 --- a/include/armnn/ArmNN.hpp +++ b/include/armnn/ArmNN.hpp @@ -9,6 +9,7 @@ #include "IRuntime.hpp" #include "INetwork.hpp" #include "LayerSupport.hpp" +#include "LstmParams.hpp" #include "Tensor.hpp" #include "Types.hpp" #include "TypesUtils.hpp" diff --git a/include/armnn/Descriptors.hpp b/include/armnn/Descriptors.hpp index 2595656c70..3cf152befe 100644 --- a/include/armnn/Descriptors.hpp +++ b/include/armnn/Descriptors.hpp @@ -95,8 +95,8 @@ private: uint32_t** m_ViewSizes; }; -// Convenience template to create a OriginsDescriptor to use when creating a Merger layer for performing concatenation -// of a number of input tensors +/// Convenience template to create an OriginsDescriptor to use when creating a Merger layer for performing concatenation +/// of a number of input tensors template OriginsDescriptor CreateMergerDescriptorForConcatenation(TensorShapeIt first, TensorShapeIt last, unsigned int concatenationDimension) @@ -301,7 +301,35 @@ struct ResizeBilinearDescriptor struct ReshapeDescriptor { + ReshapeDescriptor() + : m_TargetShape() + {} + + ReshapeDescriptor(const TensorShape& shape) + : m_TargetShape(shape) + {} + TensorShape m_TargetShape; }; +// temporary descriptor for Lstm +struct LstmDescriptor +{ + LstmDescriptor() + : m_ActivationFunc(1) // 0: None, 1: Relu, 3: Relu6, 4: Tanh, 6: Sigmoid + , m_ClippingThresCell(0.0) + , m_ClippingThresProj(0.0) + , m_CifgEnabled(true) + , m_PeepholeEnabled(false) + , m_ProjectionEnabled(false) + {} + + uint32_t m_ActivationFunc; + float m_ClippingThresCell; + float m_ClippingThresProj; + bool m_CifgEnabled; + bool m_PeepholeEnabled; + bool m_ProjectionEnabled; +}; + } diff --git a/include/armnn/DescriptorsFwd.hpp b/include/armnn/DescriptorsFwd.hpp index 58b4bcc626..8c14614876 100644 --- a/include/armnn/DescriptorsFwd.hpp +++ b/include/armnn/DescriptorsFwd.hpp @@ -12,6 +12,7 @@ struct Convolution2dDescriptor; struct DepthwiseConvolution2dDescriptor; struct FakeQuantizationDescriptor; struct FullyConnectedDescriptor; +struct LstmDescriptor; struct PermuteDescriptor; struct NormalizationDescriptor; struct Pooling2dDescriptor; diff --git a/include/armnn/Exceptions.hpp b/include/armnn/Exceptions.hpp index 630c77660d..403fc593b5 100644 --- a/include/armnn/Exceptions.hpp +++ b/include/armnn/Exceptions.hpp @@ -11,7 +11,38 @@ namespace armnn { -// base class for all ArmNN exceptions so that users can filter to just those +struct CheckLocation +{ + const char* m_Function; + const char* m_File; + unsigned int m_Line; + + CheckLocation(const char* func, + const char* file, + unsigned int line) + : m_Function{func} + , m_File{file} + , m_Line{line} + { + } + + std::string AsString() const + { + std::stringstream ss; + ss << " at function " << m_Function + << " [" << m_File << ':' << m_Line << "]"; + return ss.str(); + } + + std::string FileLine() const + { + std::stringstream ss; + ss << " [" << m_File << ':' << m_Line << "]"; + return ss.str(); + } +}; + +/// Base class for all ArmNN exceptions so that users can filter to just those. class Exception : public std::exception { public: @@ -91,4 +122,6 @@ void ConditionalThrowIfNotEqual(const std::string& message, } } -} +} // namespace armnn + +#define CHECK_LOCATION() armnn::CheckLocation(__func__, __FILE__, __LINE__) diff --git a/include/armnn/INetwork.hpp b/include/armnn/INetwork.hpp index 5cff810db5..cefcbfb06c 100644 --- a/include/armnn/INetwork.hpp +++ b/include/armnn/INetwork.hpp @@ -11,6 +11,7 @@ #include "armnn/Types.hpp" #include +#include namespace armnn { @@ -25,7 +26,8 @@ public: virtual IOutputSlot* GetConnection() = 0; protected: - ~IInputSlot() {} /// Not user deletable + /// Not user deletable. + ~IInputSlot() {} }; /// @brief An output connection slot for a layer. @@ -45,7 +47,8 @@ public: virtual void Disconnect(IInputSlot& slot) = 0; protected: - ~IOutputSlot() {} /// Not user deletable + /// Not user deletable. + ~IOutputSlot() {} }; /// @brief Interface for a layer that is connectable to other layers via InputSlots and OutputSlots. @@ -63,9 +66,12 @@ public: virtual const IOutputSlot& GetOutputSlot(unsigned int index) const = 0; virtual IOutputSlot& GetOutputSlot(unsigned int index) = 0; + virtual std::vector InferOutputShapes(const std::vector& inputShapes) const = 0; + virtual LayerGuid GetGuid() const = 0; protected: - ~IConnectableLayer() {} // Objects are not deletable via the handle + /// Objects are not deletable via the handle + ~IConnectableLayer() {} }; using INetworkPtr = std::unique_ptr; @@ -81,19 +87,19 @@ public: virtual Status PrintGraph() = 0; - /// Add an input layer to the network. - /// @param id User generated id to uniquely identify a particular input. The same id needs to be specified + /// Adds an input layer to the network. + /// @param id - User generated id to uniquely identify a particular input. The same id needs to be specified. /// when passing the inputs to the IRuntime::EnqueueWorkload() function. - /// @param name Optional name for the layer - /// @return Interface for configuring the layer. + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. virtual IConnectableLayer* AddInputLayer(LayerBindingId id, const char* name = nullptr) = 0; - /// Add a 2D convolution layer to the network. - /// @param convolution2dDescriptor Description of the 2D convolution layer - /// @param weights Tensor for the weights data. - /// @param biases (Optional) Tensor for the bias data. Must match the output tensor shape. - /// @param name Optional name for the layer - /// @return Interface for configuring the layer. + /// Adds a 2D convolution layer to the network. + /// @param convolution2dDescriptor - Description of the 2D convolution layer. + /// @param weights - Tensor for the weights data. + /// @param biases - (Optional) Tensor for the bias data. Must match the output tensor shape. + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. virtual IConnectableLayer* AddConvolution2dLayer(const Convolution2dDescriptor& convolution2dDescriptor, const ConstTensor& weights, const char* name = nullptr) = 0; @@ -103,12 +109,12 @@ public: const ConstTensor& biases, const char* name = nullptr) = 0; - /// Add a 2D depthwise convolution layer to the network. - /// @param convolution2dDescriptor Description of the 2D depthwise convolution layer - /// @param weights Tensor for the weights data. Expected format: [1, outputChannels, height, width] - /// @param biases (Optional) Tensor for the bias data. Must match the output tensor shape. - /// @param name Optional name for the layer - /// @return Interface for configuring the layer. + /// Adds a 2D depthwise convolution layer to the network. + /// @param convolution2dDescriptor - Description of the 2D depthwise convolution layer. + /// @param weights - Tensor for the weights data. Expected format: [1, outputChannels, height, width]. + /// @param biases (Optional) - Tensor for the bias data. Must match the output tensor shape. + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. virtual IConnectableLayer* AddDepthwiseConvolution2dLayer( const DepthwiseConvolution2dDescriptor& convolution2dDescriptor, const ConstTensor& weights, @@ -120,12 +126,12 @@ public: const ConstTensor& biases, const char* name = nullptr) = 0; - /// Add a fully connected layer to the network. - /// @param fullyConnectedDescriptor Description of the fully connected layer - /// @param weights Tensor for the weights data. - /// @param biases (Optional) Tensor for the bias data. - /// @param name Optional name for the layer - /// @return Interface for configuring the layer. + /// Adds a fully connected layer to the network. + /// @param fullyConnectedDescriptor - Description of the fully connected layer. + /// @param weights - Tensor for the weights data. + /// @param biases - (Optional) Tensor for the bias data. + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. virtual IConnectableLayer* AddFullyConnectedLayer(const FullyConnectedDescriptor& fullyConnectedDescriptor, const ConstTensor& weights, const char* name = nullptr) = 0; @@ -135,76 +141,77 @@ public: const ConstTensor& biases, const char* name = nullptr) = 0; - /// Add a permute layer to the network. - /// @param permuteDescriptor PermuteDescriptor to configure the permute - /// @param name Optional name for the layer - /// @return Interface for configuring the layer. + /// Adds a permute layer to the network. + /// @param permuteDescriptor - PermuteDescriptor to configure the permute. + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. virtual IConnectableLayer* AddPermuteLayer(const PermuteDescriptor& permuteDescriptor, const char* name = nullptr) = 0; - /// Add a pooling layer to the network. - /// @param pooling2dDescriptor Pooling2dDescriptor to configure the pooling - /// @param name Optional name for the layer - /// @return Interface for configuring the layer. + /// Adds a pooling layer to the network. + /// @param pooling2dDescriptor - Pooling2dDescriptor to configure the pooling. + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. virtual IConnectableLayer* AddPooling2dLayer(const Pooling2dDescriptor& pooling2dDescriptor, const char* name = nullptr) = 0; - /// Add an activation layer to the network. - /// @param activationDescriptor ActivationDescriptor to configure the activation - /// @param name Optional name for the layer - /// @return Interface for configuring the layer. + /// Adds an activation layer to the network. + /// @param activationDescriptor - ActivationDescriptor to configure the activation. + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. virtual IConnectableLayer* AddActivationLayer(const ActivationDescriptor& activationDescriptor, const char* name = nullptr) = 0; - /// Add a normalization layer to the network. - /// @param normalizationDescriptor NormalizationDescriptor to configure the normalization - /// @param name Optional name for the layer - /// @return Interface for configuring the layer. + /// Adds a normalization layer to the network. + /// @param normalizationDescriptor - NormalizationDescriptor to configure the normalization. + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. virtual IConnectableLayer* AddNormalizationLayer(const NormalizationDescriptor& normalizationDescriptor, const char* name = nullptr) = 0; - /// Add a softmax layer to the network. - /// @param softmaxDescriptor SoftmaxDescriptor to configure the softmax - /// @param name Optional name for the layer - /// @return Interface for configuring the layer. + /// Adds a softmax layer to the network. + /// @param softmaxDescriptor - SoftmaxDescriptor to configure the softmax. + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. virtual IConnectableLayer* AddSoftmaxLayer(const SoftmaxDescriptor& softmaxDescriptor, const char* name = nullptr) = 0; - /// Add a splitter layer to the network. - /// @param splitterDescriptor WindowsDescriptor to configure the splitting process. Number of Views must be equal to - /// the number of outputs, and their order must match - e.g. first view corresponds to - /// the first output, second view to the second output, etc.... - /// @param name Optional name for the layer - /// @return Interface for configuring the layer. + /// Adds a splitter layer to the network. + /// @param splitterDescriptor - WindowsDescriptor to configure the splitting process. + /// Number of Views must be equal to the number of outputs, + /// and their order must match - e.g. first view corresponds to + /// the first output, second view to the second output, etc.... + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. virtual IConnectableLayer* AddSplitterLayer(const ViewsDescriptor& splitterDescriptor , const char* name = nullptr) = 0; - /// Add a merger layer to the network. - /// @param mergerDescriptor WindowsDescriptor to configure the merging process. Number of Views must be equal to + /// Adds a merger layer to the network. + /// @param mergerDescriptor - WindowsDescriptor to configure the merging process. Number of Views must be equal to /// the number of inputs, and their order must match - e.g. first view corresponds to /// the first input, second view to the second input, etc.... - /// @param name Optional name for the layer - /// @return Interface for configuring the layer. + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. virtual IConnectableLayer* AddMergerLayer(const OriginsDescriptor& mergerDescriptor, const char* name = nullptr) = 0; - /// Add an addition layer to the network. - /// @param name Optional name for the layer - /// @return Interface for configuring the layer. + /// Adds an addition layer to the network. + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. virtual IConnectableLayer* AddAdditionLayer(const char* name = nullptr) = 0; - /// Add a multiplication layer to the network. - /// @param name Optional name for the layer - /// @return Interface for configuring the layer. + /// Adds a multiplication layer to the network. + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. virtual IConnectableLayer* AddMultiplicationLayer(const char* name = nullptr) = 0; - /// Add a batch normalization layer to the network. - /// @param mean Pre-calculated mean for each channel - /// @param variance Pre-calculated variance for each channel - /// @param beta Per-channel additive factor - /// @param gamma Per-channel multiplicative factor - /// @return Interface for configuring the layer. - /// @param name Optional name for the layer + /// Adds a batch normalization layer to the network. + /// @param mean - Pre-calculated mean for each channel. + /// @param variance - Pre-calculated variance for each channel. + /// @param beta - Per-channel additive factor. + /// @param gamma - Per-channel multiplicative factor. + /// @return - Interface for configuring the layer. + /// @param name - Optional name for the layer. virtual IConnectableLayer* AddBatchNormalizationLayer(const BatchNormalizationDescriptor& desc, const ConstTensor& mean, const ConstTensor& variance, @@ -212,47 +219,55 @@ public: const ConstTensor& gamma, const char* name = nullptr) = 0; - /// Add a resize bilinear layer to the network. - /// @param resizeDesc Parameters for the resize operation - /// @param name Optional name for the layer - /// @return Interface for configuring the layer + /// Adds a resize bilinear layer to the network. + /// @param resizeDesc - Parameters for the resize operation. + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. virtual IConnectableLayer* AddResizeBilinearLayer(const ResizeBilinearDescriptor& resizeDesc, const char* name = nullptr) = 0; - /// Add an L2 normalization layer to the network. + /// Adds an L2 normalization layer to the network. /// Normalization is performed along dimension 1, but requires a 4d input. - /// @param name Optional name for the layer - /// @return Interface for configuring the layer + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. virtual IConnectableLayer* AddL2NormalizationLayer(const char* name = nullptr) = 0; /// Adds a layer with no inputs and a single output, which always corresponds to /// the passed in constant tensor. - /// @param input Tensor to be provided as the only output of the layer. The layer will maintain its own copy of the - /// tensor data, meaning the memory referenced by @a input can be freed or reused after this function is - /// called. - /// @param name Optional name for the layer - /// @return Interface for configuring the layer + /// @param input - Tensor to be provided as the only output of the layer. The layer will maintain + /// its own copy of the tensor data, meaning the memory referenced by @a input can + /// be freed or reused after this function is called. + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. virtual IConnectableLayer* AddConstantLayer(const ConstTensor& input, const char* name = nullptr) = 0; - /// Add a reshape layer to the network. - /// @param reshapeDescriptor Parameters for the reshape operation - /// @param name Optional name for the layer - /// @return Interface for configuring the layer. + /// Adds a reshape layer to the network. + /// @param reshapeDescriptor - Parameters for the reshape operation. + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. virtual IConnectableLayer* AddReshapeLayer(const ReshapeDescriptor& reshapeDescriptor, const char* name = nullptr) = 0; - /// Add a floor layer to the network. - /// @param name Optional name for the layer - /// @return Interface for configuring the layer. + /// Adds a floor layer to the network. + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. virtual IConnectableLayer* AddFloorLayer(const char* name = nullptr) = 0; - /// Add an output layer to the network. - /// @param id User generated id to uniquely identify a particular output. The same id needs to be specified + /// Adds an output layer to the network. + /// @param id - User generated id to uniquely identify a particular output. The same id needs to be specified /// when passing the outputs to the IRuntime::EnqueueWorkload() function. + /// @param name - Optional name for the layer. + /// @return - Interface for configuring the layer. + virtual IConnectableLayer* AddOutputLayer(LayerBindingId id, const char* name = nullptr) = 0; + + /// Add a Lstm layer to the network + /// @param descriptor Parameters for the Lstm operation /// @param name Optional name for the layer /// @return Interface for configuring the layer. - virtual IConnectableLayer* AddOutputLayer(LayerBindingId id, const char* name = nullptr) = 0; + virtual IConnectableLayer* AddLstmLayer(const LstmDescriptor& descriptor, + const LstmInputParams& params, + const char* name = nullptr) = 0; protected: ~INetwork() {} @@ -268,16 +283,34 @@ public: virtual Status PrintGraph() = 0; virtual Status SerializeToDot(std::ostream& stream) const = 0; + protected: ~IOptimizedNetwork() {} }; +struct OptimizerOptions +{ + OptimizerOptions() : m_ReduceFp32ToFp16(false) {} + + OptimizerOptions(bool reduceFp32ToFp16) + : m_ReduceFp32ToFp16(reduceFp32ToFp16) + { + } + + // Reduce Fp32 data to Fp16 for faster processing + bool m_ReduceFp32ToFp16; +}; /// Create an optimized version of the network /// @param network INetwork description of the network to be optimized. -/// @param deviceSpec The choice of the default computation backend. +/// @param backendPreferences The choice of the backend ordered by user preferences. +/// @param deviceSpec DeviceSpec object as queried from the runtime. See IRuntime::GetDeviceSpec() +/// @param options OptimizerOptions object with optimizer configuration options /// @return An IOptimizedNetworkPtr interface to the optimized network, throws an exception derived from /// armnn::Exception if process fails. -IOptimizedNetworkPtr Optimize(const INetwork& network, const DeviceSpec& deviceSpec); +IOptimizedNetworkPtr Optimize(const INetwork& network, + const std::vector& backendPreferences, + const IDeviceSpec& deviceSpec, + const OptimizerOptions& options = OptimizerOptions()); } //namespace armnn diff --git a/include/armnn/IProfiler.hpp b/include/armnn/IProfiler.hpp new file mode 100644 index 0000000000..a28173e5e1 --- /dev/null +++ b/include/armnn/IProfiler.hpp @@ -0,0 +1,38 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include + +namespace armnn +{ + +class IProfiler +{ +public: + /// Enables/disables profiling for this profiler. + /// @param [in] enableProfiling A flag that indicates whether profiling should be enabled or not. + virtual void EnableProfiling(bool enableProfiling) = 0; + + /// Checks whether profiling is enabled. + /// Profiling is disabled by default. + /// @return true if profiling is enabled, false otherwise. + virtual bool IsProfilingEnabled() = 0; + + /// Analyzes the tracked events and writes the results to the given output stream. + /// Please refer to the configuration variables in Profiling.cpp to customize the information written. + /// @param [out] outStream The stream where to write the profiling results to. + virtual void AnalyzeEventsAndWriteResults(std::ostream& outStream) const = 0; + + /// Print stats for events in JSON Format to the given output stream. + /// @param [out] outStream The stream where to write the profiling results to. + virtual void Print(std::ostream& outStream) const = 0; + +protected: + ~IProfiler() {} +}; + +} // namespace armnn diff --git a/include/armnn/IRuntime.hpp b/include/armnn/IRuntime.hpp index a1a3f0fda9..36efdbdcab 100644 --- a/include/armnn/IRuntime.hpp +++ b/include/armnn/IRuntime.hpp @@ -9,6 +9,7 @@ #include "Types.hpp" #include "Tensor.hpp" #include "INetwork.hpp" +#include "IProfiler.hpp" #include "TypesUtils.hpp" namespace armnn @@ -16,7 +17,7 @@ namespace armnn using NetworkId = int; -class IClTunedParameters; +class IGpuAccTunedParameters; class IRuntime; using IRuntimePtr = std::unique_ptr; @@ -26,66 +27,80 @@ class IRuntime public: struct CreationOptions { - Compute m_DefaultComputeDevice; - bool m_UseCpuRefAsFallback; - /// If set, uses the CL tuned parameters from the given object when executing CL workloads. + CreationOptions() + : m_GpuAccTunedParameters(nullptr) + , m_EnableGpuProfiling(false) + {} + + /// If set, uses the GpuAcc tuned parameters from the given object when executing GPU workloads. /// It will also be updated with new tuned parameters if it is configured to do so. - IClTunedParameters* m_ClTunedParameters; - - CreationOptions(Compute defaultComputeDevice) - : m_DefaultComputeDevice(defaultComputeDevice) - , m_UseCpuRefAsFallback(true) - , m_ClTunedParameters(nullptr) - { - } + std::shared_ptr m_GpuAccTunedParameters; + + // Setting this flag will allow the user to obtain GPU profiling information from the runtime. + bool m_EnableGpuProfiling; }; static IRuntime* CreateRaw(const CreationOptions& options); static IRuntimePtr Create(const CreationOptions& options); static void Destroy(IRuntime* runtime); + /// Loads a complete network into the IRuntime. + /// @param [out] networkIdOut - Unique identifier for the network is returned in this reference. + /// @param [in] network - Complete network to load into the IRuntime. + /// The runtime takes ownership of the network once passed in. + /// @return armnn::Status + virtual Status LoadNetwork(NetworkId& networkIdOut, IOptimizedNetworkPtr network) = 0; + /// Load a complete network into the IRuntime. /// @param [out] networkIdOut Unique identifier for the network is returned in this reference. /// @param [in] network Complete network to load into the IRuntime. + /// @param [out] errorMessage Error message if there were any errors. /// The runtime takes ownership of the network once passed in. /// @return armnn::Status - virtual Status LoadNetwork(NetworkId& networkIdOut, IOptimizedNetworkPtr network) = 0; + virtual Status LoadNetwork(NetworkId& networkIdOut, + IOptimizedNetworkPtr network, + std::string & errorMessage) = 0; virtual TensorInfo GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const = 0; virtual TensorInfo GetOutputTensorInfo(NetworkId networkId, LayerBindingId layerId) const = 0; - // Evaluate network using input in inputTensors, outputs filled into outputTensors + /// Evaluates a network using input in inputTensors and outputs filled into outputTensors virtual Status EnqueueWorkload(NetworkId networkId, - const InputTensors& inputTensors, - const OutputTensors& outputTensors) = 0; + const InputTensors& inputTensors, + const OutputTensors& outputTensors) = 0; - /// Unload a network from the IRuntime. + /// Unloads a network from the IRuntime. /// At the moment this only removes the network from the m_Impl->m_Network. /// This might need more work in the future to be AndroidNN compliant. - /// @param [in] networkId Unique identifier for the network to be unloaded. Generated in LoadNetwork(). + /// @param [in] networkId - Unique identifier for the network to be unloaded. Generated in LoadNetwork(). /// @return armnn::Status virtual Status UnloadNetwork(NetworkId networkId) = 0; - virtual const DeviceSpec& GetDeviceSpec() const = 0; + virtual const IDeviceSpec& GetDeviceSpec() const = 0; + + /// Gets the profiler corresponding to the given network id. + /// @param networkId The id of the network for which to get the profile. + /// @return A pointer to the requested profiler, or nullptr if not found. + virtual const std::shared_ptr GetProfiler(NetworkId networkId) const = 0; protected: ~IRuntime() {} }; -using IClTunedParametersPtr = std::unique_ptr; +using IGpuAccTunedParametersPtr = std::shared_ptr; -/// Manages a set of Open CL parameters which have been tuned for maximum performance. -/// Pass an instance of this object to the IRuntime::Create() method (via IRuntime::CreationOptions) to use it -/// for all CL workload execution. +/// Manages a set of GpuAcc parameters which have been tuned for maximum performance. +/// Passes an instance of this object to the IRuntime::Create() method (via IRuntime::CreationOptions) to use it +/// for all GPU workload execution. /// /// Can be created in two modes: -/// - In UseTunedParameters mode the parameters stored in this object are used to execute CL workloads. -/// - In UpdateTunedParameters mode, additionally, whenever a CL workload is executed for the first time the +/// - In UseTunedParameters mode, the parameters stored in this object are used to execute GPU workloads. +/// - In UpdateTunedParameters mode, additionally, whenever a GPU workload is executed for the first time, the /// optimum parameters will be found and stored in this object. WARNING - This tuning can be slow. /// -/// The parameters can be loaded from and saved to a file so that you first run a slow initial read-write +/// The parameters can be loaded from and saved to a file so that you can first run a slow initial read-write /// execution, save the parameters for later and then run fast read-only executions using the optimised parameters. -class IClTunedParameters +class IGpuAccTunedParameters { public: enum class Mode @@ -96,10 +111,10 @@ public: /// Creates an IClTunedParameters with the given mode. /// @{ - static IClTunedParameters* CreateRaw(Mode mode); - static IClTunedParametersPtr Create(Mode mode); + static IGpuAccTunedParameters* CreateRaw(Mode mode); + static IGpuAccTunedParametersPtr Create(Mode mode); /// @} - static void Destroy(IClTunedParameters* params); + static void Destroy(IGpuAccTunedParameters* params); /// Loads an existing set of tuned parameters from the given file. /// If there is an error loading the file, an armnn::Exception is thrown. @@ -110,7 +125,7 @@ public: virtual void Save(const char* filename) const = 0; protected: - virtual ~IClTunedParameters() {}; + virtual ~IGpuAccTunedParameters() {}; }; } diff --git a/include/armnn/LayerSupport.hpp b/include/armnn/LayerSupport.hpp index 43a5756e4a..c875619949 100644 --- a/include/armnn/LayerSupport.hpp +++ b/include/armnn/LayerSupport.hpp @@ -13,6 +13,7 @@ namespace armnn bool IsActivationSupported(Compute compute, const TensorInfo& input, + const TensorInfo& output, const ActivationDescriptor& descriptor, char* reasonIfUnsupported = nullptr, size_t reasonIfUnsupportedMaxLength = 1024); @@ -26,6 +27,11 @@ bool IsAdditionSupported(Compute compute, bool IsBatchNormalizationSupported(Compute compute, const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, const BatchNormalizationDescriptor& descriptor, char* reasonIfUnsupported = nullptr, size_t reasonIfUnsupportedMaxLength = 1024); @@ -35,6 +41,18 @@ bool IsConstantSupported(Compute compute, char* reasonIfUnsupported = nullptr, size_t reasonIfUnsupportedMaxLength = 1024); +bool IsConvertFp16ToFp32Supported(Compute compute, + const TensorInfo& input, + const TensorInfo& output, + char* reasonIfUnsupported = nullptr, + size_t reasonIfUnsupportedMaxLength = 1024); + +bool IsConvertFp32ToFp16Supported(Compute compute, + const TensorInfo& input, + const TensorInfo& output, + char* reasonIfUnsupported = nullptr, + size_t reasonIfUnsupportedMaxLength = 1024); + bool IsConvolution2dSupported(Compute compute, const TensorInfo& input, const TensorInfo& output, @@ -46,8 +64,10 @@ bool IsConvolution2dSupported(Compute compute, bool IsDepthwiseConvolutionSupported(Compute compute, const TensorInfo& input, + const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, + const TensorInfo& biases, char* reasonIfUnsupported = nullptr, size_t reasonIfUnsupportedMaxLength = 1024); @@ -57,16 +77,35 @@ bool IsInputSupported(Compute compute, size_t reasonIfUnsupportedMaxLength = 1024); bool IsFullyConnectedSupported(Compute compute, - const TensorInfo& input,const - FullyConnectedDescriptor& descriptor, + const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, + const FullyConnectedDescriptor& descriptor, char* reasonIfUnsupported = nullptr, size_t reasonIfUnsupportedMaxLength = 1024); bool IsL2NormalizationSupported(Compute compute, const TensorInfo& input, + const TensorInfo& output, char* reasonIfUnsupported = nullptr, size_t reasonIfUnsupportedMaxLength = 1024); +bool IsLstmSupported(Compute compute, const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor& descriptor, + const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights, char* reasonIfUnsupported = nullptr, + size_t reasonIfUnsupportedMaxLength = 1024); + bool IsMergerSupported(Compute compute, const std::vector inputs, const OriginsDescriptor& descriptor, @@ -76,6 +115,7 @@ bool IsMergerSupported(Compute compute, bool IsMultiplicationSupported(Compute compute, const TensorInfo& input0, const TensorInfo& input1, + const TensorInfo& output, char* reasonIfUnsupported = nullptr, size_t reasonIfUnsupportedMaxLength = 1024); @@ -112,6 +152,7 @@ bool IsResizeBilinearSupported(Compute compute, bool IsSoftmaxSupported(Compute compute, const TensorInfo& input, + const TensorInfo& output, const SoftmaxDescriptor& descriptor, char* reasonIfUnsupported = nullptr, size_t reasonIfUnsupportedMaxLength = 1024); diff --git a/include/armnn/LstmParams.hpp b/include/armnn/LstmParams.hpp new file mode 100644 index 0000000000..cfca0df5bb --- /dev/null +++ b/include/armnn/LstmParams.hpp @@ -0,0 +1,55 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "TensorFwd.hpp" + +namespace armnn +{ + +struct LstmInputParams +{ + LstmInputParams() + : m_InputToInputWeights(nullptr) + , m_InputToForgetWeights(nullptr) + , m_InputToCellWeights(nullptr) + , m_InputToOutputWeights(nullptr) + , m_RecurrentToInputWeights(nullptr) + , m_RecurrentToForgetWeights(nullptr) + , m_RecurrentToCellWeights(nullptr) + , m_RecurrentToOutputWeights(nullptr) + , m_CellToInputWeights(nullptr) + , m_CellToForgetWeights(nullptr) + , m_CellToOutputWeights(nullptr) + , m_InputGateBias(nullptr) + , m_ForgetGateBias(nullptr) + , m_CellBias(nullptr) + , m_OutputGateBias(nullptr) + , m_ProjectionWeights(nullptr) + , m_ProjectionBias(nullptr) + { + } + + const ConstTensor* m_InputToInputWeights; + const ConstTensor* m_InputToForgetWeights; + const ConstTensor* m_InputToCellWeights; + const ConstTensor* m_InputToOutputWeights; + const ConstTensor* m_RecurrentToInputWeights; + const ConstTensor* m_RecurrentToForgetWeights; + const ConstTensor* m_RecurrentToCellWeights; + const ConstTensor* m_RecurrentToOutputWeights; + const ConstTensor* m_CellToInputWeights; + const ConstTensor* m_CellToForgetWeights; + const ConstTensor* m_CellToOutputWeights; + const ConstTensor* m_InputGateBias; + const ConstTensor* m_ForgetGateBias; + const ConstTensor* m_CellBias; + const ConstTensor* m_OutputGateBias; + const ConstTensor* m_ProjectionWeights; + const ConstTensor* m_ProjectionBias; +}; + +} // namespace armnn + diff --git a/include/armnn/NetworkFwd.hpp b/include/armnn/NetworkFwd.hpp index 75667fdfd0..56aedaf8d4 100644 --- a/include/armnn/NetworkFwd.hpp +++ b/include/armnn/NetworkFwd.hpp @@ -6,6 +6,7 @@ namespace armnn { +struct LstmInputParams; class INetwork; class IOptimizedNetwork; class Graph; @@ -13,4 +14,4 @@ class IInputSlot; class IOutputSlot; class IConnectableLayer; class IDataLayer; -} \ No newline at end of file +} diff --git a/include/armnn/Tensor.hpp b/include/armnn/Tensor.hpp index 910278f33f..718dd817c5 100644 --- a/include/armnn/Tensor.hpp +++ b/include/armnn/Tensor.hpp @@ -18,7 +18,7 @@ namespace armnn class TensorShape { public: - /// Empty (invalid) constructor + /// Empty (invalid) constructor. TensorShape(); TensorShape(unsigned int numDimensions, const unsigned int* dimensionSizes); @@ -53,7 +53,7 @@ private: class TensorInfo { public: - /// Empty (invalid) constructor + /// Empty (invalid) constructor. TensorInfo(); TensorInfo(const TensorShape& shape, DataType dataType, @@ -88,7 +88,7 @@ public: private: TensorShape m_Shape; DataType m_DataType; - /// Scale and offset values used for quantization + /// Scale and offset values are used for quantization. struct Quantization { Quantization() : m_Scale(0.f), m_Offset(0) {} @@ -102,11 +102,11 @@ template class BaseTensor { public: - /// Empty (invalid) constructor + /// Empty (invalid) constructor. BaseTensor(); /// Constructor from a raw memory pointer. - /// @param memoryArea Region of CPU-addressable memory where tensor data will be stored. Must be valid while + /// @param memoryArea - Region of CPU-addressable memory where tensor data will be stored. Must be valid while /// workloads are on the fly. Tensor instances do not claim ownership of referenced memory regions, that is, /// no attempt will be made by ArmNN to free these memory regions automatically. BaseTensor(const TensorInfo& info, MemoryType memoryArea); @@ -130,7 +130,7 @@ public: MemoryType GetMemoryArea() const { return m_MemoryArea; } protected: - // protected destructor to stop users from making these + // Protected destructor to stop users from making these // (could still new one on the heap and then leak it...) ~BaseTensor() {} @@ -144,21 +144,23 @@ private: class Tensor : public BaseTensor { public: - using BaseTensor::BaseTensor; // Bring in the constructors and assignment operator + /// Brings in the constructors and assignment operator. + using BaseTensor::BaseTensor; }; /// A tensor defined by a TensorInfo (shape and data type) and an immutable backing store. class ConstTensor : public BaseTensor { public: - using BaseTensor::BaseTensor; // Bring in the constructors and assignment operator + /// Brings in the constructors and assignment operator. + using BaseTensor::BaseTensor; ConstTensor() : BaseTensor() {} // This needs to be redefined explicitly?? - // Can be implicitly constructed from non-const Tensor + /// Can be implicitly constructed from non-const Tensor. ConstTensor(const Tensor& other) : BaseTensor(other.GetInfo(), other.GetMemoryArea()) {} /// Constructor from a backing container. - /// @param container An stl-like container type which implements data() and size() methods. + /// @param container - An stl-like container type which implements data() and size() methods. /// Presence of data() and size() is a strong indicator of the continuous memory layout of the container, /// which is a requirement for Tensor data. Tensor instances do not claim ownership of referenced memory regions, /// that is, no attempt will be made by ArmNN to free these memory regions automatically. diff --git a/include/armnn/Types.hpp b/include/armnn/Types.hpp index c9a4bf13e5..fe1fcb45d2 100644 --- a/include/armnn/Types.hpp +++ b/include/armnn/Types.hpp @@ -22,9 +22,10 @@ enum class Status enum class DataType { - Float32 = 0, - QuantisedAsymm8 = 1, - Signed32 = 2 + Float16 = 0, + Float32 = 1, + QuantisedAsymm8 = 2, + Signed32 = 3 }; enum class ActivationFunction @@ -33,7 +34,7 @@ enum class ActivationFunction TanH = 1, Linear = 2, ReLu = 3, - BoundedReLu = 4, //< min(a, max(b, input)) + BoundedReLu = 4, ///< min(a, max(b, input)) SoftReLu = 5, LeakyReLu = 6, Abs = 7, @@ -51,16 +52,18 @@ enum class PoolingAlgorithm /// /// The padding method modifies the output of pooling layers. /// In both supported methods, the values are ignored (they are -/// not even zeros which would make a difference for max pooling +/// not even zeroes, which would make a difference for max pooling /// a tensor with negative values). The difference between -/// IgnoreValue and Exclude is that the former count the padding +/// IgnoreValue and Exclude is that the former counts the padding /// fields in the divisor of Average and L2 pooling, while /// Exclude does not. /// enum class PaddingMethod { - IgnoreValue = 0, // The padding fields count, but ignored - Exclude = 1 // The padding fields don't count and ignored + /// The padding fields count, but are ignored + IgnoreValue = 0, + /// The padding fields don't count and are ignored + Exclude = 1 }; enum class NormalizationAlgorithmChannel @@ -71,8 +74,10 @@ enum class NormalizationAlgorithmChannel enum class NormalizationAlgorithmMethod { - LocalBrightness = 0, /* Krichevsky 2012: Local Brightness Normalization */ - LocalContrast = 1 /* Jarret 2009: Local Contrast Normalization */ + /// Krichevsky 2012: Local Brightness Normalization + LocalBrightness = 0, + /// Jarret 2009: Local Contrast Normalization + LocalContrast = 1 }; enum class OutputShapeRounding @@ -83,15 +88,20 @@ enum class OutputShapeRounding enum class Compute { - CpuRef = 0, // CPU Execution: Reference C++ kernels - CpuAcc = 1, // CPU Execution: NEON: ArmCompute - GpuAcc = 2, // GPU Execution: OpenCL: ArmCompute + /// CPU Execution: Reference C++ kernels + CpuRef = 0, + /// CPU Execution: NEON: ArmCompute + CpuAcc = 1, + /// GPU Execution: OpenCL: ArmCompute + GpuAcc = 2, Undefined = 5 }; -struct DeviceSpec +class IDeviceSpec { - Compute DefaultComputeDevice; +protected: + IDeviceSpec() {}; + virtual ~IDeviceSpec() {}; }; /// Type of identifiers for bindable layers (inputs, outputs). @@ -105,10 +115,10 @@ public: using ArrayType = std::array; using ConstIterator = typename ArrayType::const_iterator; - /// @param dimMappings Indicates how to translate tensor elements from a given source into the target destination, + /// @param dimMappings - Indicates how to translate tensor elements from a given source into the target destination, /// when source and target potentially have different memory layouts. /// - /// E.g. For a 4-d tensor laid out in memory with format (Batch Element, Height, Width, Channels), + /// E.g. For a 4-d tensor laid out in a memory with the format (Batch Element, Height, Width, Channels), /// which is to be passed as an input to ArmNN, each source dimension is mapped to the corresponding /// ArmNN dimension. The Batch dimension remains the same (0 -> 0). The source Height dimension is mapped /// to the location of the ArmNN Height dimension (1 -> 2). Similar arguments are made for the Width and @@ -152,7 +162,7 @@ private: SizeType m_NumDimMappings; }; -// Define LayerGuid type. +/// Define LayerGuid type. using LayerGuid = unsigned int; } diff --git a/include/armnn/TypesUtils.hpp b/include/armnn/TypesUtils.hpp index c63b653ae3..3077ce111f 100644 --- a/include/armnn/TypesUtils.hpp +++ b/include/armnn/TypesUtils.hpp @@ -10,6 +10,7 @@ #include #include #include +#include namespace armnn { @@ -89,8 +90,9 @@ constexpr unsigned int GetDataTypeSize(DataType dataType) { switch (dataType) { - case DataType::Signed32: - case DataType::Float32: return 4U; + case DataType::Float16: return 2U; + case DataType::Float32: + case DataType::Signed32: return 4U; case DataType::QuantisedAsymm8: return 1U; default: return 0U; } @@ -107,17 +109,17 @@ constexpr bool StrEqual(const char* strA, const char (&strB)[N]) return isEqual; } -constexpr Compute ParseComputeDevice(const char* str) +constexpr armnn::Compute ParseComputeDevice(const char* str) { - if (StrEqual(str, "CpuAcc")) + if (armnn::StrEqual(str, "CpuAcc")) { return armnn::Compute::CpuAcc; } - else if (StrEqual(str, "CpuRef")) + else if (armnn::StrEqual(str, "CpuRef")) { return armnn::Compute::CpuRef; } - else if (StrEqual(str, "GpuAcc")) + else if (armnn::StrEqual(str, "GpuAcc")) { return armnn::Compute::GpuAcc; } @@ -131,59 +133,60 @@ constexpr const char* GetDataTypeName(DataType dataType) { switch (dataType) { - case DataType::Float32: return "Float32"; + case DataType::Float16: return "Float16"; + case DataType::Float32: return "Float32"; case DataType::QuantisedAsymm8: return "Unsigned8"; - case DataType::Signed32: return "Signed32"; - default: return "Unknown"; + case DataType::Signed32: return "Signed32"; + + default: + return "Unknown"; } } -template -constexpr DataType GetDataType(); - -template <> -constexpr DataType GetDataType() -{ - return DataType::Float32; -} -template <> -constexpr DataType GetDataType() -{ - return DataType::QuantisedAsymm8; -} +template +struct IsHalfType + : std::integral_constant::value && sizeof(T) == 2> +{}; -template <> -constexpr DataType GetDataType() -{ - return DataType::Signed32; -} +template +struct GetDataTypeImpl; template -constexpr bool IsQuantizedType() +struct GetDataTypeImpl::value, T>> { - return std::is_integral::value; -} - + static constexpr DataType Value = DataType::Float16; +}; -template -struct ResolveTypeImpl; +template<> +struct GetDataTypeImpl +{ + static constexpr DataType Value = DataType::Float32; +}; template<> -struct ResolveTypeImpl +struct GetDataTypeImpl { - using Type = uint8_t; + static constexpr DataType Value = DataType::QuantisedAsymm8; }; template<> -struct ResolveTypeImpl +struct GetDataTypeImpl { - using Type = float; + static constexpr DataType Value = DataType::Signed32; }; -template -using ResolveType = typename ResolveTypeImpl
::Type; +template +constexpr DataType GetDataType() +{ + return GetDataTypeImpl::Value; +} +template +constexpr bool IsQuantizedType() +{ + return std::is_integral::value; +} inline std::ostream& operator<<(std::ostream& os, Status stat) { @@ -191,7 +194,23 @@ inline std::ostream& operator<<(std::ostream& os, Status stat) return os; } -inline std::ostream& operator<<(std::ostream& os, Compute compute) +inline std::ostream& operator<<(std::ostream& os, const std::vector& compute) +{ + for (const Compute& comp : compute) { + os << GetComputeDeviceAsCString(comp) << " "; + } + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const std::set& compute) +{ + for (const Compute& comp : compute) { + os << GetComputeDeviceAsCString(comp) << " "; + } + return os; +} + +inline std::ostream& operator<<(std::ostream& os, const Compute& compute) { os << GetComputeDeviceAsCString(compute); return os; @@ -212,11 +231,11 @@ inline std::ostream & operator<<(std::ostream & os, const armnn::TensorShape & s return os; } -/// Quantize a floating point data type into an 8-bit data type -/// @param value The value to quantize -/// @param scale The scale (must be non-zero) -/// @param offset The offset -/// @return The quantized value calculated as round(value/scale)+offset +/// Quantize a floating point data type into an 8-bit data type. +/// @param value - The value to quantize. +/// @param scale - The scale (must be non-zero). +/// @param offset - The offset. +/// @return - The quantized value calculated as round(value/scale)+offset. /// template inline QuantizedType Quantize(float value, float scale, int32_t offset) @@ -234,11 +253,11 @@ inline QuantizedType Quantize(float value, float scale, int32_t offset) return quantizedBits; } -/// Dequantize an 8-bit data type into a floating point data type -/// @param value The value to dequantize -/// @param scale The scale (must be non-zero) -/// @param offset The offset -/// @return The dequantized value calculated as (value-offset)*scale +/// Dequantize an 8-bit data type into a floating point data type. +/// @param value - The value to dequantize. +/// @param scale - The scale (must be non-zero). +/// @param offset - The offset. +/// @return - The dequantized value calculated as (value-offset)*scale. /// template inline float Dequantize(QuantizedType value, float scale, int32_t offset) @@ -249,4 +268,18 @@ inline float Dequantize(QuantizedType value, float scale, int32_t offset) return dequantized; } +template +void VerifyTensorInfoDataType(const armnn::TensorInfo & info) +{ + auto expectedType = armnn::GetDataType(); + if (info.GetDataType() != expectedType) + { + std::stringstream ss; + ss << "Unexpected datatype:" << armnn::GetDataTypeName(info.GetDataType()) + << " for tensor:" << info.GetShape() + << ". The type expected to be: " << armnn::GetDataTypeName(expectedType); + throw armnn::Exception(ss.str()); + } +} + } //namespace armnn diff --git a/include/armnn/Utils.hpp b/include/armnn/Utils.hpp index 1a0c34baad..4b5cb9892d 100644 --- a/include/armnn/Utils.hpp +++ b/include/armnn/Utils.hpp @@ -4,6 +4,9 @@ // #pragma once +#include +#include "armnn/TypesUtils.hpp" + namespace armnn { @@ -24,4 +27,4 @@ enum class LogSeverity /// severity: All log messages that are at this severity level or higher will be printed, others will be ignored. void ConfigureLogging(bool printToStandardOutput, bool printToDebugOutput, LogSeverity severity); -} +} // namespace armnn diff --git a/include/armnn/Version.hpp b/include/armnn/Version.hpp index d5f794eb8b..1a290d7177 100644 --- a/include/armnn/Version.hpp +++ b/include/armnn/Version.hpp @@ -9,4 +9,4 @@ // YYYY = 4-digit year number // MM = 2-digit month number // PP = 2-digit patch number -#define ARMNN_VERSION "20180502" +#define ARMNN_VERSION "20180800" diff --git a/include/armnnCaffeParser/ICaffeParser.hpp b/include/armnnCaffeParser/ICaffeParser.hpp index 55fc85052b..0f23a658b2 100644 --- a/include/armnnCaffeParser/ICaffeParser.hpp +++ b/include/armnnCaffeParser/ICaffeParser.hpp @@ -28,28 +28,28 @@ public: static ICaffeParserPtr Create(); static void Destroy(ICaffeParser* parser); - /// Create the network from a protobuf text file on disk + /// Create the network from a protobuf text file on the disk. virtual armnn::INetworkPtr CreateNetworkFromTextFile( const char* graphFile, const std::map& inputShapes, const std::vector& requestedOutputs) = 0; - /// Create the network from a protobuf binary file on disk + /// Create the network from a protobuf binary file on the disk. virtual armnn::INetworkPtr CreateNetworkFromBinaryFile( const char* graphFile, const std::map& inputShapes, const std::vector& requestedOutputs) = 0; - /// Create the network directly from protobuf text in a string. Useful for debugging/testing + /// Create the network directly from protobuf text in a string. Useful for debugging/testin.g virtual armnn::INetworkPtr CreateNetworkFromString( const char* protoText, const std::map& inputShapes, const std::vector& requestedOutputs) = 0; - /// Retrieve binding info (layer id and tensor info) for the network input identified by the given layer name + /// Retrieve binding info (layer id and tensor info) for the network input identified by the given layer name. virtual BindingPointInfo GetNetworkInputBindingInfo(const std::string& name) const = 0; - /// Retrieve binding info (layer id and tensor info) for the network output identified by the given layer name + /// Retrieve binding info (layer id and tensor info) for the network output identified by the given layer name. virtual BindingPointInfo GetNetworkOutputBindingInfo(const std::string& name) const = 0; protected: diff --git a/include/armnnOnnxParser/IOnnxParser.hpp b/include/armnnOnnxParser/IOnnxParser.hpp new file mode 100644 index 0000000000..c7ec41ec84 --- /dev/null +++ b/include/armnnOnnxParser/IOnnxParser.hpp @@ -0,0 +1,48 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include +#include + +#include +#include +#include + +namespace armnnOnnxParser +{ + +using BindingPointInfo = std::pair; + +class IOnnxParser; +using IOnnxParserPtr = std::unique_ptr; + +class IOnnxParser +{ +public: + static IOnnxParser* CreateRaw(); + static IOnnxParserPtr Create(); + static void Destroy(IOnnxParser* parser); + + /// Create the network from a protobuf binary file on disk + virtual armnn::INetworkPtr CreateNetworkFromBinaryFile(const char* graphFile) = 0; + + /// Create the network from a protobuf text file on disk + virtual armnn::INetworkPtr CreateNetworkFromTextFile(const char* graphFile) = 0; + + /// Create the network directly from protobuf text in a string. Useful for debugging/testing + virtual armnn::INetworkPtr CreateNetworkFromString(const std::string& protoText) = 0; + + /// Retrieve binding info (layer id and tensor info) for the network input identified by the given layer name + virtual BindingPointInfo GetNetworkInputBindingInfo(const std::string& name) const = 0; + + /// Retrieve binding info (layer id and tensor info) for the network output identified by the given layer name + virtual BindingPointInfo GetNetworkOutputBindingInfo(const std::string& name) const = 0; + + protected: + virtual ~IOnnxParser() {}; + }; + + } diff --git a/include/armnnTfLiteParser/ITfLiteParser.hpp b/include/armnnTfLiteParser/ITfLiteParser.hpp new file mode 100644 index 0000000000..a4f5e21327 --- /dev/null +++ b/include/armnnTfLiteParser/ITfLiteParser.hpp @@ -0,0 +1,61 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "armnn/Types.hpp" +#include "armnn/NetworkFwd.hpp" +#include "armnn/Tensor.hpp" +#include "armnn/INetwork.hpp" + +#include +#include +#include + +namespace armnnTfLiteParser +{ + +// TODO: revise this: do we really need this for every parser??? +using BindingPointInfo = std::pair; + +class ITfLiteParser; +using ITfLiteParserPtr = std::unique_ptr; + +class ITfLiteParser +{ +public: + static ITfLiteParser* CreateRaw(); + static ITfLiteParserPtr Create(); + static void Destroy(ITfLiteParser* parser); + + /// Create the network from a flatbuffers binary file on disk + virtual armnn::INetworkPtr CreateNetworkFromBinaryFile(const char* graphFile) = 0; + + /// Create the network from a flatbuffers binary + virtual armnn::INetworkPtr CreateNetworkFromBinary(const std::vector & binaryContent) = 0; + + /// Retrieve binding info (layer id and tensor info) for the network input identified by + /// the given layer name and subgraph id + virtual BindingPointInfo GetNetworkInputBindingInfo(size_t subgraphId, + const std::string& name) const = 0; + + /// Retrieve binding info (layer id and tensor info) for the network output identified by + /// the given layer name and subgraph id + virtual BindingPointInfo GetNetworkOutputBindingInfo(size_t subgraphId, + const std::string& name) const = 0; + + /// Return the number of subgraphs in the parsed model + virtual size_t GetSubgraphCount() const = 0; + + /// Return the input tensor names for a given subgraph + virtual std::vector GetSubgraphInputTensorNames(size_t subgraphId) const = 0; + + /// Return the output tensor names for a given subgraph + virtual std::vector GetSubgraphOutputTensorNames(size_t subgraphId) const = 0; + +protected: + virtual ~ITfLiteParser() {}; +}; + +} diff --git a/include/armnnTfParser/ITfParser.hpp b/include/armnnTfParser/ITfParser.hpp index a6f56c8a19..ab480b83e0 100644 --- a/include/armnnTfParser/ITfParser.hpp +++ b/include/armnnTfParser/ITfParser.hpp @@ -21,7 +21,7 @@ using BindingPointInfo = std::pair; class ITfParser; using ITfParserPtr = std::unique_ptr; -/// parses a directed acyclic graph from a tensorflow protobuf file +/// Parses a directed acyclic graph from a tensorflow protobuf file. class ITfParser { public: @@ -29,28 +29,28 @@ public: static ITfParserPtr Create(); static void Destroy(ITfParser* parser); - /// Create the network from a protobuf text file on disk + /// Create the network from a protobuf text file on the disk. virtual armnn::INetworkPtr CreateNetworkFromTextFile( const char* graphFile, const std::map& inputShapes, const std::vector& requestedOutputs) = 0; - /// Create the network from a protobuf binary file on disk + /// Create the network from a protobuf binary file on the disk. virtual armnn::INetworkPtr CreateNetworkFromBinaryFile( const char* graphFile, const std::map& inputShapes, const std::vector& requestedOutputs) = 0; - /// Create the network directly from protobuf text in a string. Useful for debugging/testing + /// Create the network directly from protobuf text in a string. Useful for debugging/testing. virtual armnn::INetworkPtr CreateNetworkFromString( const char* protoText, const std::map& inputShapes, const std::vector& requestedOutputs) = 0; - /// Retrieve binding info (layer id and tensor info) for the network input identified by the given layer name + /// Retrieve binding info (layer id and tensor info) for the network input identified by the given layer name. virtual BindingPointInfo GetNetworkInputBindingInfo(const std::string& name) const = 0; - /// Retrieve binding info (layer id and tensor info) for the network output identified by the given layer name + /// Retrieve binding info (layer id and tensor info) for the network output identified by the given layer name. virtual BindingPointInfo GetNetworkOutputBindingInfo(const std::string& name) const = 0; protected: diff --git a/samples/CMakeLists.txt b/samples/CMakeLists.txt new file mode 100644 index 0000000000..3009ac9a67 --- /dev/null +++ b/samples/CMakeLists.txt @@ -0,0 +1,4 @@ +if(BUILD_SAMPLE_APP) + add_executable(SimpleSample SimpleSample.cpp) + target_link_libraries(SimpleSample armnn pthread) +endif() diff --git a/samples/SimpleSample.cpp b/samples/SimpleSample.cpp new file mode 100644 index 0000000000..43cd93f432 --- /dev/null +++ b/samples/SimpleSample.cpp @@ -0,0 +1,68 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include +#include "armnn/ArmNN.hpp" + +/// A simple example of using the ArmNN SDK API. In this sample, the users single input number is multiplied by 1.0f +/// using a fully connected layer with a single neuron to produce an output number that is the same as the input. +int main() +{ + using namespace armnn; + + float number; + std::cout << "Please enter a number: " << std::endl; + std::cin >> number; + + // Construct ArmNN network + armnn::NetworkId networkIdentifier; + INetworkPtr myNetwork = INetwork::Create(); + + armnn::FullyConnectedDescriptor fullyConnectedDesc; + float weightsData[] = {1.0f}; // Identity + TensorInfo weightsInfo(TensorShape({1, 1}), DataType::Float32); + armnn::ConstTensor weights(weightsInfo, weightsData); + IConnectableLayer *fullyConnected = myNetwork->AddFullyConnectedLayer(fullyConnectedDesc, weights, + "fully connected"); + + IConnectableLayer *InputLayer = myNetwork->AddInputLayer(0); + IConnectableLayer *OutputLayer = myNetwork->AddOutputLayer(0); + + InputLayer->GetOutputSlot(0).Connect(fullyConnected->GetInputSlot(0)); + fullyConnected->GetOutputSlot(0).Connect(OutputLayer->GetInputSlot(0)); + + // Create ArmNN runtime + IRuntime::CreationOptions options; // default options + IRuntimePtr run = IRuntime::Create(options); + + //Set the tensors in the network. + TensorInfo inputTensorInfo(TensorShape({1, 1}), DataType::Float32); + InputLayer->GetOutputSlot(0).SetTensorInfo(inputTensorInfo); + + TensorInfo outputTensorInfo(TensorShape({1, 1}), DataType::Float32); + fullyConnected->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); + + // Optimise ArmNN network + armnn::IOptimizedNetworkPtr optNet = Optimize(*myNetwork, {Compute::CpuRef}, run->GetDeviceSpec()); + + // Load graph into runtime + run->LoadNetwork(networkIdentifier, std::move(optNet)); + + //Creates structures for inputs and outputs. + std::vector inputData{number}; + std::vector outputData(1); + + + armnn::InputTensors inputTensors{{0, armnn::ConstTensor(run->GetInputTensorInfo(networkIdentifier, 0), + inputData.data())}}; + armnn::OutputTensors outputTensors{{0, armnn::Tensor(run->GetOutputTensorInfo(networkIdentifier, 0), + outputData.data())}}; + + // Execute network + run->EnqueueWorkload(networkIdentifier, inputTensors, outputTensors); + + std::cout << "Your number was " << outputData[0] << std::endl; + return 0; + +} diff --git a/src/armnn/Descriptors.cpp b/src/armnn/Descriptors.cpp index be04294e85..faf167d95f 100644 --- a/src/armnn/Descriptors.cpp +++ b/src/armnn/Descriptors.cpp @@ -157,7 +157,7 @@ const uint32_t* OriginsDescriptor::GetViewOrigin(uint32_t idx) const } -// Reorder the viewOrigins in accordance with the indices presented in newOrdering array +// Reorders the viewOrigins in accordance with the indices presented in newOrdering array. void OriginsDescriptor::ReorderOrigins(unsigned int* newOrdering, unsigned int numNewOrdering) { BOOST_ASSERT_MSG(m_NumViews == numNewOrdering, "number of views must match number of " diff --git a/src/armnn/DeviceSpec.hpp b/src/armnn/DeviceSpec.hpp new file mode 100644 index 0000000000..3706438482 --- /dev/null +++ b/src/armnn/DeviceSpec.hpp @@ -0,0 +1,22 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "armnn/Types.hpp" +#include + +namespace armnn +{ + +class DeviceSpec : public IDeviceSpec +{ +public: + DeviceSpec() {} + virtual ~DeviceSpec() {} + + std::set m_SupportedComputeDevices; +}; + +} diff --git a/src/armnn/Graph.cpp b/src/armnn/Graph.cpp index 87bdc2962f..74b30e4087 100644 --- a/src/armnn/Graph.cpp +++ b/src/armnn/Graph.cpp @@ -32,7 +32,7 @@ Graph::Graph(const Graph& other) otherToClonedMap.emplace(otherLayer, layer); } - // Copy slot connections + // Copies slot connections. for (auto&& otherLayer : other.m_Layers) { Layer* const thisLayer = otherToClonedMap[otherLayer]; @@ -95,18 +95,18 @@ Status Graph::SerializeToDot(std::ostream& stream) .AddAttribute("fontname", "arial-bold"); } - // First declare the nodes + // First declares the nodes. for (auto&& layer : m_Layers) { DotNode node(stream, layer->GetGuid(), GetLayerTypeAsCString(layer->GetType())); - // Extract the layer parameters + // Extracts the layer parameters. ParameterStringifyFunction extractParams = [&node](const std::string & name, const std::string & value){ node.GetContents().AddContent(name + " : " + value); }; layer->SerializeLayerParameters(extractParams); } - // Second declare the edges + // Second declares the edges. for (auto&& layer : m_Layers) { LayerGuid toId = layer->GetGuid(); @@ -117,9 +117,9 @@ Status Graph::SerializeToDot(std::ostream& stream) LayerGuid fromId = outputSlot->GetOwningLayer().GetGuid(); DotEdge edge(stream, fromId, toId); - // Now Print the tensor shape on the edge + // Now print the tensor shape on the edge. { - // Construct the label attribute with HTML markup + // Constructs the label attribute with HTML markup. std::stringstream ss; ss << "< " << outputSlot->GetTensorInfo().GetShape() << " >"; edge.GetAttributeSet().AddAttribute("label", ss); @@ -137,13 +137,94 @@ Status Graph::SerializeToDot(std::ostream& stream) Status Graph::AllocateDynamicBuffers() { + // Layers must be sorted in topological order + BOOST_ASSERT(m_LayersInOrder); + + std::unordered_set preallocatedTensors; + std::unordered_map handleReferenceCounts; + + // Finds the first TensorHandle ancestor of a SubTensorHandle. If the ITensorHandle provided + // is a TensorHandle, the function just returns it + auto TraceSubTensorHandleAncestry = [](ITensorHandle* const subTensorHandle) + { + ITensorHandle* ancestor = subTensorHandle; + while (ancestor && ancestor->GetParent()) + { + ancestor = ancestor->GetParent(); + } + return ancestor; + }; + + // Checks whether a TensorHandle has been pre-allocated + auto IsPreallocated = [&](ITensorHandle* const tensorHandle) + { + return tensorHandle && preallocatedTensors.find(tensorHandle) != preallocatedTensors.end(); + }; + + // Constant tensor handles need to last from the beginning of execution till the end, + // therefore we pre-allocate them upfront for (auto&& layer : m_Layers) { - for (auto slot = layer->BeginOutputSlots(); slot != layer->EndOutputSlots(); ++slot) + if (layer->GetType() == LayerType::Constant) { - slot->GetOutputHandler().AllocateTensors(); + for (auto&& slot = layer->BeginOutputSlots(); slot != layer->EndOutputSlots(); ++slot) + { + ITensorHandle *tensorHandle = TraceSubTensorHandleAncestry(slot->GetOutputHandler().GetData()); + + if (tensorHandle && !IsPreallocated(tensorHandle)) + { + tensorHandle->Allocate(); + preallocatedTensors.insert(tensorHandle); + } + } } } + + // Iterate over the network in topological order + for (auto&& layer : m_Layers) + { + // Count the amount of times each output slot references a certain buffer (ITensorHandle). + // The first time we encounter a new tensor handle, we start managing its lifetime. + for (auto&& slot = layer->BeginOutputSlots(); slot != layer->EndOutputSlots(); ++slot) + { + ITensorHandle *tensorHandle = TraceSubTensorHandleAncestry(slot->GetOutputHandler().GetData()); + + if (tensorHandle && !IsPreallocated(tensorHandle)) + { + unsigned int numConnections = slot->GetNumConnections(); + if (handleReferenceCounts.find(tensorHandle) == handleReferenceCounts.end()) + { + handleReferenceCounts[tensorHandle] = numConnections; + tensorHandle->Manage(); + } + else + { + handleReferenceCounts[tensorHandle] += numConnections; + } + } + } + + // Loop through the input slots in the same layer and decrement the reference counter associated + // to each tensor handle we encounter. Once it reaches zero, we end the lifetime of the tensor handle + for (auto&& slot = layer->BeginInputSlots(); slot != layer->EndInputSlots(); ++slot) + { + ITensorHandle *tensorHandle = TraceSubTensorHandleAncestry( + slot->GetConnectedOutputSlot()->GetOutputHandler().GetData()); + + if (tensorHandle && !IsPreallocated(tensorHandle)) + { + --handleReferenceCounts[tensorHandle]; + + if (handleReferenceCounts[tensorHandle] == 0u) + { + // Stop managing lifetime of tensor handle + tensorHandle->Allocate(); + handleReferenceCounts.erase(tensorHandle); + } + } + } + } + return Status::Success; } @@ -151,7 +232,7 @@ const Graph& Graph::TopologicalSort() const { if (!m_LayersInOrder) { - //Reset layer order + // Resets layer order. for (auto&& it : m_Layers) { it->ResetPriority(); @@ -178,9 +259,9 @@ void Graph::AddCopyLayers() // CPU -> Neon (and viceversa) auto MayNeedCopyLayer = [](const Layer& layer) { - // All layers should have been associated with a valid compute device at this point + // All layers should have been associated with a valid compute device at this point. BOOST_ASSERT(layer.GetComputeDevice() != Compute::Undefined); - // Do not need another copy layer if copy layer is already present + // Does not need another copy layer if a copy layer is already present. return layer.GetType() != LayerType::MemCopy; }; @@ -191,14 +272,14 @@ void Graph::AddCopyLayers() unsigned int srcOutputIndex = 0; for (auto&& srcOutput : srcLayer->GetOutputSlots()) { - for (auto&& dstInput : srcOutput.GetConnections()) + std::vector connectionCopy = srcOutput.GetConnections(); + for (auto&& dstInput : connectionCopy) { Layer& dstLayer = dstInput->GetOwningLayer(); - if (MayNeedCopyLayer(dstLayer) && (dstLayer.GetComputeDevice() != srcLayer->GetComputeDevice())) { - // A copy layer is needed in between the source and destination layers - // Record the operation rather than attempting to modify the graph as we go + // A copy layer is needed in between the source and destination layers. + // Record the operation rather than attempting to modify the graph as we go. // (invalidating iterators) const std::string copyLayerName = boost::str(boost::format("[ %1% (%2%) -> %3% (%4%) ]") % srcLayer->GetName() diff --git a/src/armnn/Graph.hpp b/src/armnn/Graph.hpp index 06b6fd32ae..fd81e51b7b 100644 --- a/src/armnn/Graph.hpp +++ b/src/armnn/Graph.hpp @@ -5,6 +5,7 @@ #pragma once #include "LayersFwd.hpp" +#include "IGraphObservable.hpp" #include #include @@ -12,6 +13,7 @@ #include #include +#include #include #include #include @@ -21,6 +23,7 @@ namespace armnn { + class Graph { public: @@ -31,7 +34,7 @@ public: } using LayersList = std::list; - using Iterator = LayersList::const_iterator; // const so pointers in the list can't be modified externally + using Iterator = LayersList::const_iterator; // Const so pointers in the list can't be modified externally. using ConstIterator = boost::transform_iterator), Iterator>; using IteratorDifference = Iterator::difference_type; @@ -94,7 +97,7 @@ public: Status SerializeToDot(std::ostream& stream); - /// Adds a new layer of type LaterType to the graph constructed with the arguments passed. + /// Adds a new layer, of type LayerType, to the graph constructed with the arguments passed. template LayerT* AddLayer(Args&&... args); @@ -103,6 +106,10 @@ public: template LayerT* InsertNewLayer(InputSlot& insertBefore, Args&&... args); + /// Inserts a new layer between insertAfter and the input slot(s) currently connected to it + template + LayerT* InsertNewLayer(OutputSlot& insertAfter, Args&&... args); + /// Deletes the layer at the specified position and returns an iterator pointing /// to the next element after the one being deleted. Iterator EraseLayer(Iterator pos); @@ -113,22 +120,22 @@ public: template Iterator EraseLayer(LayerT*& layer); - /// Return iterator pointing to begin of list. Lowercase for range-based for loops. + /// Returns iterator pointing to the beginning of the list. Lowercase for range-based for loops. Iterator begin() { return m_Layers.begin(); } - /// Return iterator pointing to end of list. Lowercase for range-based for loops. + /// Returns iterator pointing to the end of the list. Lowercase for range-based for loops. Iterator end() { return m_Layers.end(); } - /// Return const iterator pointing to begin of list. Lowercase for range-based for loops. + /// Returns const iterator pointing to the beginning of the list. Lowercase for range-based for loops. ConstIterator begin() const { return {m_Layers.begin(), &PtrCast}; } - /// Return const iterator pointing to end of list. Lowercase for range-based for loops. + /// Returns const iterator pointing to the end of the list. Lowercase for range-based for loops. ConstIterator end() const { return {m_Layers.end(), &PtrCast}; } - /// Return const iterator pointing to begin of list. Lowercase for range-based for loops. + /// Returns const iterator pointing to the beginning of the list. Lowercase for range-based for loops. ConstIterator cbegin() const { return begin(); } - /// Return const iterator pointing to end of list. Lowercase for range-based for loops. + /// Returns const iterator pointing to the end of the list. Lowercase for range-based for loops. ConstIterator cend() const { return end(); } - /// Sort layers in topological order and return this. + /// Sorts layers in topological order and return this. Graph& TopologicalSort() { const_cast(this)->TopologicalSort(); return *this; } const Graph& TopologicalSort() const; @@ -136,16 +143,16 @@ public: size_t GetNumOutputs() const { return m_OutputIds.size(); } /// Returns a wrapper object with begin(), end() methods to iterate over the input layers - /// in a range-based for loop + /// in a range-based for loop. InputLayersAccessor GetInputLayers() const { return InputLayersAccessor(*this); } /// Returns a wrapper object with begin(), end() methods to iterate over the output layers - /// in a range-based for loop + /// in a range-based for loop. OutputLayersAccessor GetOutputLayers() const { return OutputLayersAccessor(*this); } size_t GetNumLayers() const { return m_Layers.size(); } - /// Allocate memory for all tensors under output tensor handers of each layer + /// Allocates memory for all tensors under output tensor handers of each layer. Status AllocateDynamicBuffers(); /// Modifies the graph in-place, removing edges connecting layers using different compute devices, @@ -154,6 +161,14 @@ public: void InferTensorInfos(); + void AttachObservable(IGraphObservable* const observable, GraphEvent notifyOnEvent) { + m_Views[notifyOnEvent].emplace_back(observable); + } + + void DetachObservable(IGraphObservable* const observable, GraphEvent notifyOnEvent) { + m_Views[notifyOnEvent].remove(observable); + } + private: template class LayerInGraphBase; @@ -179,9 +194,18 @@ private: return it; } - /// Get the position of a layer in the graph. + /// Gets the position of a layer in the graph. Iterator GetPosInGraph(Layer& layer); + void NotifyObservables(GraphEvent event, Layer* graphState) + { + // Iterate over all observables observing this event + for (auto& observable : m_Views[event]) + { + observable->Update(graphState); + } + } + std::unordered_set m_InputIds; std::unordered_set m_OutputIds; std::unordered_map m_PosInGraphMap; @@ -189,9 +213,11 @@ private: /// Mutable to allow sorting on const object. mutable LayersList m_Layers; mutable bool m_LayersInOrder; + + std::map> m_Views; }; -/// Common base class for layers in the graph +/// Common base class for layers in the graph. template class Graph::LayerInGraphBase : public LayerT { @@ -212,7 +238,7 @@ protected: Graph& m_Graph; }; -/// Input/Output layers specialize this template +/// Input/Output layers specialize this template. template class Graph::LayerInGraph final : public LayerInGraphBase { @@ -305,24 +331,51 @@ inline LayerT* Graph::AddLayer(Args&&... args) { m_LayersInOrder = m_LayersInOrder && ((LayerEnumOf() == LayerType::Input) || (LayerEnumOf() == LayerType::Output)); - return new LayerInGraph(*this, std::forward(args)...); + LayerT* const layer = new LayerInGraph(*this, std::forward(args)...); + + NotifyObservables(GraphEvent::LayerAdded, layer); + + return layer; } template inline LayerT* Graph::InsertNewLayer(InputSlot& insertBefore, Args&&... args) { - // Insert after the parent if any, or before the child otherwise, so topological order is kept. + // Insert after the parent if any, or before the child otherwise, so the topological order is kept. OutputSlot* parentOut = insertBefore.GetConnectedOutputSlot(); const Iterator pos = (parentOut != nullptr) ? std::next(GetPosInGraph(parentOut->GetOwningLayer())) : GetPosInGraph(insertBefore.GetOwningLayer()); LayerT* const layer = new LayerInGraph(*this, pos, std::forward(args)...); insertBefore.Insert(*layer); + + NotifyObservables(GraphEvent::LayerAdded, layer); + + return layer; +} + +template +inline LayerT* Graph::InsertNewLayer(OutputSlot& insertAfter, Args&&... args) +{ + Layer& owningLayer = insertAfter.GetOwningLayer(); + + const Iterator pos = std::next(GetPosInGraph(owningLayer)); + LayerT* const layer = new LayerInGraph(*this, pos, std::forward(args)...); + + BOOST_ASSERT(layer->GetNumInputSlots() == 1); + + insertAfter.MoveAllConnections(layer->GetOutputSlot()); + insertAfter.Connect(layer->GetInputSlot(0)); + + NotifyObservables(GraphEvent::LayerAdded, layer); + return layer; } inline Graph::Iterator Graph::EraseLayer(Iterator pos) { + NotifyObservables(GraphEvent::LayerErased, *pos); + delete *pos; return m_Layers.erase(pos); } diff --git a/src/armnn/Half.hpp b/src/armnn/Half.hpp new file mode 100644 index 0000000000..4a10c3c8ab --- /dev/null +++ b/src/armnn/Half.hpp @@ -0,0 +1,35 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include +#include + +namespace armnn +{ + using Half = half_float::half; //import half float implementation +} //namespace armnn + + +namespace std +{ + +template<> +struct is_floating_point + : integral_constant< bool, true > +{}; + +template<> +struct is_floating_point + : integral_constant< bool, true > +{}; + +template<> +struct is_floating_point + : integral_constant< bool, true > +{}; + +} //namespace std \ No newline at end of file diff --git a/src/armnn/IGraphObservable.hpp b/src/armnn/IGraphObservable.hpp new file mode 100644 index 0000000000..f1779ec1da --- /dev/null +++ b/src/armnn/IGraphObservable.hpp @@ -0,0 +1,28 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "Layer.hpp" + +namespace armnn +{ + +enum class GraphEvent +{ + LayerAdded, + LayerErased +}; + +class IGraphObservable +{ +public: + virtual void Update(Layer* graphLayer) = 0; + +protected: + virtual ~IGraphObservable() = default; +}; + +} //namespace armnn + diff --git a/src/armnn/Instrument.hpp b/src/armnn/Instrument.hpp new file mode 100644 index 0000000000..8d3ac5a76c --- /dev/null +++ b/src/armnn/Instrument.hpp @@ -0,0 +1,66 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include +#include + +namespace armnn +{ + +struct Measurement +{ + enum Unit + { + TIME_NS, + TIME_US, + TIME_MS, + }; + + inline static const char* ToString(Unit unit) + { + switch (unit) + { + case TIME_NS: return "ns"; + case TIME_US: return "us"; + case TIME_MS: return "ms"; + default: return ""; + } + } + + Measurement(const std::string& name, double value, Unit unit) + : m_Name(name) + , m_Value(value) + , m_Unit(unit) + {} + Measurement(const Measurement&) = default; + ~Measurement() = default; + + std::string m_Name; + double m_Value; + Unit m_Unit; + +private: + // please don't default construct, otherwise Units will be wrong + Measurement() = delete; +}; + +class Instrument +{ +public: + virtual ~Instrument() {} + + virtual void Start() = 0; + + virtual void Stop() = 0; + + virtual std::vector GetMeasurements() const = 0; + + virtual const char* GetName() const = 0; + +}; + +} //namespace armnn diff --git a/src/armnn/InternalTypes.cpp b/src/armnn/InternalTypes.cpp index e39b15be05..3426da3d24 100644 --- a/src/armnn/InternalTypes.cpp +++ b/src/armnn/InternalTypes.cpp @@ -18,6 +18,8 @@ char const* GetLayerTypeAsCString(LayerType type) case LayerType::Addition: return "Addition"; case LayerType::BatchNormalization: return "BatchNormalization"; case LayerType::Constant: return "Constant"; + case LayerType::ConvertFp16ToFp32: return "ConvertFp16ToFp32"; + case LayerType::ConvertFp32ToFp16: return "ConvertFp32ToFp16"; case LayerType::Convolution2d: return "Convolution2d"; case LayerType::DepthwiseConvolution2d: return "DepthwiseConvolution2d"; case LayerType::FakeQuantization: return "FakeQuantization"; @@ -25,6 +27,7 @@ char const* GetLayerTypeAsCString(LayerType type) case LayerType::FullyConnected: return "FullyConnected"; case LayerType::Input: return "Input"; case LayerType::L2Normalization: return "L2Normalization"; + case LayerType::Lstm: return "Lstm"; case LayerType::MemCopy: return "MemCopy"; case LayerType::Merger: return "Merger"; case LayerType::Multiplication: return "Multiplication"; diff --git a/src/armnn/InternalTypes.hpp b/src/armnn/InternalTypes.hpp index 8db0da4cf2..0968e17b18 100644 --- a/src/armnn/InternalTypes.hpp +++ b/src/armnn/InternalTypes.hpp @@ -18,6 +18,8 @@ enum class LayerType Addition, BatchNormalization, Constant, + ConvertFp16ToFp32, + ConvertFp32ToFp16, Convolution2d, DepthwiseConvolution2d, FakeQuantization, @@ -25,6 +27,7 @@ enum class LayerType FullyConnected, Input, L2Normalization, + Lstm, MemCopy, Merger, Multiplication, @@ -35,7 +38,7 @@ enum class LayerType Reshape, ResizeBilinear, Softmax, - // Last layer goes here + // Last layer goes here. LastLayer, Splitter = LastLayer, }; diff --git a/src/armnn/JsonPrinter.cpp b/src/armnn/JsonPrinter.cpp new file mode 100644 index 0000000000..f7c1c68758 --- /dev/null +++ b/src/armnn/JsonPrinter.cpp @@ -0,0 +1,134 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "JsonPrinter.hpp" + +#include +#include + +namespace armnn +{ + +void JsonPrinter::PrintJsonChildObject(const JsonChildObject& object) +{ + PrintLabel(object.m_Label); + PrintMeasurementsList(object.m_Measurements); + PrintSeparator(); + PrintNewLine(); + PrintUnit(object.m_Unit); + + if (!object.m_Children.empty()) + { + PrintSeparator(); + PrintNewLine(); + for (unsigned int childIndex = 0; childIndex < object.m_Children.size(); ++childIndex) + { + PrintJsonChildObject(object.m_Children[childIndex]); + // Only print separator and new line if current child is not the last element. + if (&object.m_Children[childIndex] != &object.m_Children.back()) + { + PrintSeparator(); + PrintNewLine(); + } + } + } + PrintNewLine(); + PrintFooter(); +} + +void JsonPrinter::PrintHeader() +{ + m_OutputStream << "{" << std::endl; + IncrementNumberOfTabs(); +} + +void JsonPrinter::PrintArmNNHeader() +{ + PrintTabs(); + m_OutputStream << R"("ArmNN": {)" << std::endl; + IncrementNumberOfTabs(); +} + +void JsonPrinter::PrintLabel(const std::string& label) +{ + PrintTabs(); + m_OutputStream << R"(")" << label << R"(": {)" << std::endl; + IncrementNumberOfTabs(); +} + +void JsonPrinter::PrintUnit(armnn::Measurement::Unit unit) +{ + PrintTabs(); + m_OutputStream << R"("unit": ")"; + m_OutputStream << armnn::Measurement::ToString(unit); + m_OutputStream << R"(")"; +} + +void JsonPrinter::PrintMeasurementsList(const std::vector& measurementsVector) +{ + if (measurementsVector.empty()) + { + return; + } + + PrintTabs(); + m_OutputStream << R"("raw": [)" << std::endl; + IncrementNumberOfTabs(); + PrintTabs(); + auto iter = measurementsVector.begin(); + m_OutputStream << *iter; + for (iter = std::next(iter); iter != measurementsVector.end(); ++iter) + { + m_OutputStream << "," << std::endl; + PrintTabs(); + m_OutputStream << *iter; + } + m_OutputStream << std::endl; + DecrementNumberOfTabs(); + PrintTabs(); + m_OutputStream << "]"; +} + +void JsonPrinter::PrintTabs() +{ + unsigned int numTabs = m_NumTabs; + while (numTabs-- > 0) + { + m_OutputStream << "\t"; + } +} + +void JsonPrinter::PrintSeparator() +{ + m_OutputStream << ","; +} + +void JsonPrinter::PrintNewLine() +{ + m_OutputStream << std::endl; +} + +void JsonPrinter::PrintFooter() +{ + DecrementNumberOfTabs(); + PrintTabs(); + m_OutputStream << "}"; +} + +void JsonPrinter::DecrementNumberOfTabs() +{ + if (m_NumTabs == 0) + { + return; + } + --m_NumTabs; +} + +void JsonPrinter::IncrementNumberOfTabs() +{ + ++m_NumTabs; +} + +} // namespace armnn \ No newline at end of file diff --git a/src/armnn/JsonPrinter.hpp b/src/armnn/JsonPrinter.hpp new file mode 100644 index 0000000000..1bf9e3175b --- /dev/null +++ b/src/armnn/JsonPrinter.hpp @@ -0,0 +1,82 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include +#include +#include + +#include "Instrument.hpp" + +namespace armnn +{ + +struct JsonChildObject +{ + JsonChildObject(const std::string& label) + : m_Label(label), m_Unit(Measurement::Unit::TIME_MS) + {} + JsonChildObject(const JsonChildObject&) = default; + + void AddMeasurement(const double measurement) + { + m_Measurements.push_back(measurement); + } + + void AddChild(const JsonChildObject& childObject) + { + m_Children.push_back(childObject); + } + + JsonChildObject GetChild(const unsigned int index) + { + return m_Children[index]; + } + + void SetUnit(const Measurement::Unit unit) + { + m_Unit = unit; + } + + ~JsonChildObject() = default; + + std::string m_Label; + Measurement::Unit m_Unit; + std::vector m_Measurements; + std::vector m_Children; + +private: + JsonChildObject() = delete; +}; + +class JsonPrinter +{ +public: + void PrintJsonChildObject(const JsonChildObject& object); + void PrintHeader(); + void PrintArmNNHeader(); + void PrintFooter(); + void PrintSeparator(); + void PrintNewLine(); + void PrintLabel(const std::string& label); + void PrintUnit(armnn::Measurement::Unit unit); + void PrintMeasurementsList(const std::vector& measurementsVector); + +public: + JsonPrinter(std::ostream &outputStream) + : m_OutputStream(outputStream), m_NumTabs(0) + {} + +private: + void PrintTabs(); + void DecrementNumberOfTabs(); + void IncrementNumberOfTabs(); + + std::ostream &m_OutputStream; + unsigned int m_NumTabs; +}; + +} // namespace armnn \ No newline at end of file diff --git a/src/armnn/Layer.cpp b/src/armnn/Layer.cpp index fcf0656aeb..9f6d75c46b 100644 --- a/src/armnn/Layer.cpp +++ b/src/armnn/Layer.cpp @@ -10,6 +10,7 @@ #include #include #include +#include "backends/CpuTensorHandle.hpp" #include @@ -24,19 +25,19 @@ void InputSlot::Insert(Layer& layer) if (prevSlot != nullptr) { - // Disconnect parent from this + // Disconnects parent from this. prevSlot->Disconnect(*this); - // Connect inserted layer to parent + // Connects inserted layer to parent. BOOST_ASSERT(layer.GetNumInputSlots() == 1); prevSlot->Connect(layer.GetInputSlot(0)); - // Set tensor info for inserted layer + // Sets tensor info for inserted layer. const TensorInfo& tensorInfo = prevSlot->GetTensorInfo(); layer.GetOutputHandler().SetTensorInfo(tensorInfo); } - // Connect inserted layer to this + // Connects inserted layer to this. layer.GetOutputSlot(0).Connect(*this); } @@ -117,11 +118,11 @@ void OutputSlot::ValidateConnectionIndex(unsigned int index) const namespace { LayerGuid GenerateLayerGuid() { - //Note: Not thread safe. + // Note: Not thread safe. static LayerGuid newGuid=0; return newGuid++; } -} //namespace +} // namespace Layer::Layer(unsigned int numInputSlots, unsigned int numOutputSlots, LayerType type, const char* name) : m_OutputHandlers(numOutputSlots) @@ -147,7 +148,7 @@ void Layer::CollectWorkloadInputs(WorkloadDataCollector& dataCollector, const Gr { for (auto&& inputSlot : GetInputSlots()) { - // The graph must be well-formed at this point + // The graph must be well-formed at this point. BOOST_ASSERT(inputSlot.GetConnection()); const OutputHandler& outputHandler = inputSlot.GetConnectedOutputSlot()->GetOutputHandler(); dataCollector.Push(outputHandler.GetData(), outputHandler.GetTensorInfo()); @@ -170,13 +171,22 @@ void Layer::CreateTensorHandles(Graph& graph, const IWorkloadFactory& factory) } } +void Layer::ReleaseConstantData() +{ + // Now free up the static data. + OperateOnConstantTensors([](std::unique_ptr& handle) + { + handle.reset(nullptr); + }); +} + DataType Layer::GetDataType() const { - if (GetNumInputSlots() > 0) // Ignore the input layer + if (GetNumInputSlots() > 0) // Ignore the input layer. { return GetInputSlot(0).GetConnection()->GetTensorInfo().GetDataType(); } - return DataType::Float32; + return GetOutputSlot(0).GetTensorInfo().GetDataType(); } void Layer::ResetPriority() const @@ -226,4 +236,64 @@ LayerPriority Layer::GetPriority() const return m_Priority; } +void Layer::VerifyLayerConnections(unsigned int expectedConnections, const CheckLocation& location) const +{ + BOOST_ASSERT(GetNumInputSlots() == expectedConnections); + + for (unsigned int i=0; iGetType()) + % GetNameStr() + % location.AsString())); + } + if(! GetInputSlot(i).GetConnection()->IsTensorInfoSet()) + { + throw LayerValidationException( + boost::str( + boost::format( + "TensorInfo of Input connection #%1% must be set on connected OutputSlot for " + "%2% layer %3% %4%") + % i + % GetLayerTypeAsCString(this->GetType()) + % GetNameStr() + % location.AsString())); + } + } +} + +std::vector Layer::InferOutputShapes(const std::vector& inputShapes) const +{ + BOOST_ASSERT(GetNumInputSlots() != 0); + BOOST_ASSERT(GetNumOutputSlots() != 0); + + // By default we return what we got, meaning the output shape(s) are the same as the input(s). + // This only works if the number of inputs and outputs are the same. Since we are in the Layer + // base class, this means the implementation needs to be overridden in the specific layers for + // the other cases. So the missing implementation justifies the UnimplementedException. + + if (GetNumInputSlots() != GetNumOutputSlots()) + { + throw UnimplementedException( + boost::str( + boost::format( + "Default implementation for InferOutputShapes can only be used for " + "layers with the same number of input and output slots. This doesn't " + "hold for %1% layer %2% (#inputs=%3% #outputs=%4%) %5%") + % GetLayerTypeAsCString(this->GetType()) + % GetNameStr() + % GetNumInputSlots() + % GetNumOutputSlots() + % CHECK_LOCATION().AsString())); + } + return inputShapes; +} + } // namespace armnn diff --git a/src/armnn/Layer.hpp b/src/armnn/Layer.hpp index 2a199afc24..ebd6b251b4 100644 --- a/src/armnn/Layer.hpp +++ b/src/armnn/Layer.hpp @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include #include @@ -51,7 +53,7 @@ public: const OutputSlot* GetConnectedOutputSlot() const { return m_Connection; } OutputSlot* GetConnectedOutputSlot() { return m_Connection; } - /// Links the slot to an output slot or breaks an existing link if passing nullptr + /// Links the slot to an output slot or breaks an existing link if passing nullptr. void SetConnection(OutputSlot* source) { if (m_Connection != nullptr && source != nullptr) @@ -62,7 +64,7 @@ public: m_Connection = source; } - // Insert single-output existing layer at this point in the graph. + // Inserts single-output existing layer at this point in the graph. void Insert(Layer& layer); // IInputSlot @@ -113,10 +115,10 @@ public: bool ValidateTensorShape(const TensorShape& shape) const; - // Disconnect all conections + // Disconnect all conections. void DisconnectAll(); - /// Move all connections to another OutputSlot + /// Moves all connections to another OutputSlot. void MoveAllConnections(OutputSlot& destination); // IOutputSlot @@ -147,7 +149,7 @@ private: std::vector m_Connections; }; -// InputSlot inlines that need OutputSlot declaration +// InputSlot inlines that need OutputSlot declaration. inline InputSlot::~InputSlot() { @@ -172,6 +174,9 @@ inline InputSlot::~InputSlot() inline const IOutputSlot* InputSlot::GetConnection() const { return GetConnectedOutputSlot(); } inline IOutputSlot* InputSlot::GetConnection() { return GetConnectedOutputSlot(); } + +class ScopedCpuTensorHandle; + // Base layer class using LayerPriority = unsigned int; @@ -179,7 +184,7 @@ using LayerPriority = unsigned int; class Layer : public IConnectableLayer { public: - /// @param name Optional name for the layer (may be nullptr) + /// @param name - Optional name for the layer (may be nullptr). Layer(unsigned int numInputSlots, unsigned int numOutputSlots, LayerType type, const char* name); const std::string& GetNameStr() const @@ -200,15 +205,15 @@ public: const std::vector& GetInputSlots() const { return m_InputSlots; } const std::vector& GetOutputSlots() const { return m_OutputSlots; } - // Allow non-const access to input slots, but don't expose vector (vector size is fixed at layer construction). + // Allows non-const access to input slots, but don't expose vector (vector size is fixed at layer construction). std::vector::iterator BeginInputSlots() { return m_InputSlots.begin(); } std::vector::iterator EndInputSlots() { return m_InputSlots.end(); } - // Allow non-const access to output slots, but don't expose vector (vector size is fixed at layer construction). + // Allows non-const access to output slots, but don't expose vector (vector size is fixed at layer construction). std::vector::iterator BeginOutputSlots() { return m_OutputSlots.begin(); } std::vector::iterator EndOutputSlots() { return m_OutputSlots.end(); } - // Check whether the outputs of this layer don't have any connection + // Checks whether the outputs of this layer don't have any connection. bool IsOutputUnconnected() { unsigned int numConnections = 0; @@ -221,7 +226,7 @@ public: return (GetNumOutputSlots() > 0) && (numConnections == 0); } - // Used for sorting + // Used for sorting. void ResetPriority() const; LayerPriority GetPriority() const; @@ -238,16 +243,35 @@ public: virtual void CreateTensorHandles(Graph& graph, const IWorkloadFactory& factory); - /// Creates a dynamically-allocated copy of this layer - /// @param graph The Graph into which this Layer is being cloned + /// Creates a dynamically-allocated copy of this layer. + /// @param graph - The Graph into which this Layer is being cloned. virtual Layer* Clone(Graph& graph) const = 0; + void VerifyLayerConnections(unsigned int expectedConnections, const CheckLocation& location) const; + virtual void ValidateTensorShapesFromInputs() = 0; - /// Helper to serialize the layer parameters to string - /// (currently used in DotSerializer and company) + std::vector InferOutputShapes(const std::vector& inputShapes) const override; + + /// Helper to serialize the layer parameters to string. + /// (currently used in DotSerializer and company). virtual void SerializeLayerParameters(ParameterStringifyFunction &) const {} + // Free up the constant source data + virtual void ReleaseConstantData(); + + template + void OperateOnConstantTensors(Op op) + { + for (auto constant : GetConstantTensorsByRef()) + { + if (constant.get()) + { + op(constant); + } + } + }; + // IConnectableLayer const char* GetName() const override { return m_LayerName.c_str(); } @@ -263,8 +287,12 @@ public: void SetGuid(LayerGuid guid) { m_Guid = guid; } LayerGuid GetGuid() const final { return m_Guid; } + void AddRelatedLayerName(const std::string layerName) { m_RelatedLayerNames.emplace_back(layerName); } + + const std::list& GetRelatedLayerNames() { return m_RelatedLayerNames; } + protected: - // Graph needs access to the virtual destructor + // Graph needs access to the virtual destructor. friend class Graph; virtual ~Layer() = default; @@ -282,7 +310,7 @@ protected: CollectWorkloadOutputs(dataCollector, graph); } - /// Helper function to reduce duplication in *Layer::CreateWorkload + /// Helper function to reduce duplication in *Layer::CreateWorkload. template WorkloadInfo PrepInfoAndDesc(QueueDescriptor& descriptor, const Graph& graph) const { @@ -295,6 +323,10 @@ protected: template LayerType* CloneBase(Graph& graph, Params&& ... params) const; + // Retrieve the Handles to the constants + using ConstantTensors = std::vector>>; + virtual ConstantTensors GetConstantTensorsByRef() {return ConstantTensors(); }; + private: void CollectWorkloadInputs(WorkloadDataCollector& dataCollector, const Graph& graph) const; void CollectWorkloadOutputs(WorkloadDataCollector& dataCollector, const Graph& graph) const; @@ -311,14 +343,16 @@ private: const LayerType m_Type; Compute m_ComputeDevice; - /// Used for sorting + /// Used for sorting. mutable LayerPriority m_Priority = 0; mutable bool m_Visiting = false; LayerGuid m_Guid; + + std::list m_RelatedLayerNames; }; -// A layer user-provided data can be bound to (e.g. inputs, outputs) +// A layer user-provided data can be bound to (e.g. inputs, outputs). class BindableLayer : public Layer { public: diff --git a/src/armnn/LayerSupport.cpp b/src/armnn/LayerSupport.cpp index a0f6276e2b..a734e03a56 100644 --- a/src/armnn/LayerSupport.cpp +++ b/src/armnn/LayerSupport.cpp @@ -16,20 +16,20 @@ namespace armnn { -// Helper function to copy a full string to a truncated version +/// Helper function to copy a full string to a truncated version. void CopyErrorMessage(char* truncatedString, const char* fullString, size_t maxLength) { if(truncatedString != nullptr) { size_t copyLength = std::min(maxLength, strlen(fullString)); std::strncpy(truncatedString, fullString, copyLength); - // Ensure null-terminated string + // Ensure null-terminated string. truncatedString[copyLength] = '\0'; } } // Helper macro to avoid code duplication. -// Forwards function func to funcRef, funcNeon or funcCl, depending on the value of compute +// Forwards function func to funcRef, funcNeon or funcCl, depending on the value of compute. #define FORWARD_LAYER_SUPPORT_FUNC(compute, func, ...) \ std::string reasonIfUnsupportedFull; \ bool isSupported; \ @@ -58,11 +58,12 @@ bool CheckTensorDataTypesEqual(const TensorInfo& input0, const TensorInfo& input bool IsActivationSupported(Compute compute, const TensorInfo& input, + const TensorInfo& output, const ActivationDescriptor& descriptor, char* reasonIfUnsupported, size_t reasonIfUnsupportedMaxLength) { - FORWARD_LAYER_SUPPORT_FUNC(compute, IsActivationSupported, input, descriptor); + FORWARD_LAYER_SUPPORT_FUNC(compute, IsActivationSupported, input, output, descriptor); } bool IsAdditionSupported(Compute compute, @@ -82,11 +83,24 @@ bool IsAdditionSupported(Compute compute, bool IsBatchNormalizationSupported(Compute compute, const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, const BatchNormalizationDescriptor& descriptor, char* reasonIfUnsupported, size_t reasonIfUnsupportedMaxLength) { - FORWARD_LAYER_SUPPORT_FUNC(compute, IsBatchNormalizationSupported, input, descriptor); + FORWARD_LAYER_SUPPORT_FUNC(compute, + IsBatchNormalizationSupported, + input, + output, + mean, + var, + beta, + gamma, + descriptor); } bool IsConstantSupported(Compute compute, @@ -97,6 +111,24 @@ bool IsConstantSupported(Compute compute, FORWARD_LAYER_SUPPORT_FUNC(compute, IsConstantSupported, output); } +bool IsConvertFp16ToFp32Supported(Compute compute, + const TensorInfo& input, + const TensorInfo& output, + char* reasonIfUnsupported, + size_t reasonIfUnsupportedMaxLength) +{ + FORWARD_LAYER_SUPPORT_FUNC(compute, IsConvertFp16ToFp32Supported, input, output); +} + +bool IsConvertFp32ToFp16Supported(Compute compute, + const TensorInfo& input, + const TensorInfo& output, + char* reasonIfUnsupported, + size_t reasonIfUnsupportedMaxLength) +{ + FORWARD_LAYER_SUPPORT_FUNC(compute, IsConvertFp32ToFp16Supported, input, output); +} + bool IsConvolution2dSupported(Compute compute, const TensorInfo& input, const TensorInfo& output, @@ -111,12 +143,14 @@ bool IsConvolution2dSupported(Compute compute, bool IsDepthwiseConvolutionSupported(Compute compute, const TensorInfo& input, + const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, + const TensorInfo& biases, char* reasonIfUnsupported, size_t reasonIfUnsupportedMaxLength) { - FORWARD_LAYER_SUPPORT_FUNC(compute, IsDepthwiseConvolutionSupported, input, descriptor, weights); + FORWARD_LAYER_SUPPORT_FUNC(compute, IsDepthwiseConvolutionSupported, input, output, descriptor, weights, biases); } bool IsInputSupported(Compute compute, @@ -129,21 +163,51 @@ bool IsInputSupported(Compute compute, bool IsFullyConnectedSupported(Compute compute, const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, const FullyConnectedDescriptor& descriptor, char* reasonIfUnsupported, size_t reasonIfUnsupportedMaxLength) { - FORWARD_LAYER_SUPPORT_FUNC(compute, IsFullyConnectedSupported, input, descriptor); + FORWARD_LAYER_SUPPORT_FUNC(compute, IsFullyConnectedSupported, input, output, weights, biases, descriptor); } bool IsL2NormalizationSupported(Compute compute, const TensorInfo& input, + const TensorInfo& output, char* reasonIfUnsupported, size_t reasonIfUnsupportedMaxLength) { - FORWARD_LAYER_SUPPORT_FUNC(compute, IsL2NormalizationSupported, input); + FORWARD_LAYER_SUPPORT_FUNC(compute, IsL2NormalizationSupported, input, output); } +bool IsLstmSupported(Compute compute, const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor& descriptor, + const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights, char* reasonIfUnsupported, + size_t reasonIfUnsupportedMaxLength) + +{ + FORWARD_LAYER_SUPPORT_FUNC(compute, IsLstmSupported, input, outputStateIn, cellStateIn, + scratchBuffer, outputStateOut, cellStateOut, + output, descriptor, inputToForgetWeights, inputToCellWeights, + inputToOutputWeights, recurrentToForgetWeights, + recurrentToCellWeights, recurrentToOutputWeights, + forgetGateBias, cellBias, outputGateBias, + inputToInputWeights, recurrentToInputWeights, + cellToInputWeights, inputGateBias, projectionWeights, + projectionBias, cellToForgetWeights, cellToOutputWeights); +} bool IsMergerSupported(Compute compute, std::vector inputs, const OriginsDescriptor& descriptor, @@ -157,10 +221,11 @@ bool IsMergerSupported(Compute compute, bool IsMultiplicationSupported(Compute compute, const TensorInfo& input0, const TensorInfo& input1, + const TensorInfo& output, char* reasonIfUnsupported, size_t reasonIfUnsupportedMaxLength) { - FORWARD_LAYER_SUPPORT_FUNC(compute, IsMultiplicationSupported, input0, input1); + FORWARD_LAYER_SUPPORT_FUNC(compute, IsMultiplicationSupported, input0, input1, output); } bool IsNormalizationSupported(Compute compute, @@ -211,11 +276,12 @@ bool IsResizeBilinearSupported(Compute compute, bool IsSoftmaxSupported(Compute compute, const TensorInfo& input, + const TensorInfo& output, const SoftmaxDescriptor& descriptor, char* reasonIfUnsupported, size_t reasonIfUnsupportedMaxLength) { - FORWARD_LAYER_SUPPORT_FUNC(compute, IsSoftmaxSupported, input, descriptor); + FORWARD_LAYER_SUPPORT_FUNC(compute, IsSoftmaxSupported, input, output, descriptor); } bool IsSplitterSupported(Compute compute, @@ -250,7 +316,7 @@ bool IsFloorSupported(Compute compute, char* reasonIfUnsupported, size_t reasonIfUnsupportedMaxLength) { - // By definition (that is, regardless of compute device), shapes and data type must match + // By definition (that is, regardless of compute device), shapes and data type must match. if (input.GetShape() != output.GetShape() || input.GetDataType() != output.GetDataType()) { return false; diff --git a/src/armnn/LayerSupportCommon.hpp b/src/armnn/LayerSupportCommon.hpp index 5b7feac387..63065c0565 100644 --- a/src/armnn/LayerSupportCommon.hpp +++ b/src/armnn/LayerSupportCommon.hpp @@ -11,17 +11,20 @@ namespace armnn { -template +template bool IsSupportedForDataTypeGeneric(std::string* reasonIfUnsupported, DataType dataType, - Float32Func floatFuncPtr, + Float16Func float16FuncPtr, + Float32Func float32FuncPtr, Uint8Func uint8FuncPtr, Params&&... params) { switch(dataType) { + case DataType::Float16: + return float16FuncPtr(reasonIfUnsupported, std::forward(params)...); case DataType::Float32: - return floatFuncPtr(reasonIfUnsupported, std::forward(params)...); + return float32FuncPtr(reasonIfUnsupported, std::forward(params)...); case DataType::QuantisedAsymm8: return uint8FuncPtr(reasonIfUnsupported, std::forward(params)...); default: @@ -41,6 +44,16 @@ bool FalseFunc(std::string* reasonIfUnsupported, Params&&... params) return false; } +template +bool FalseFuncF16(std::string* reasonIfUnsupported, Params&&... params) +{ + if (reasonIfUnsupported) + { + *reasonIfUnsupported = "Layer is not supported with float16 data type"; + } + return false; +} + template bool FalseFuncF32(std::string* reasonIfUnsupported, Params&&... params) { @@ -61,4 +74,44 @@ bool FalseFuncU8(std::string* reasonIfUnsupported, Params&&... params) return false; } +template +bool FalseInputFuncF32(std::string* reasonIfUnsupported, Params&&... params) +{ + if (reasonIfUnsupported) + { + *reasonIfUnsupported = "Layer is not supported with float32 data type input"; + } + return false; +} + +template +bool FalseInputFuncF16(std::string* reasonIfUnsupported, Params&&... params) +{ + if (reasonIfUnsupported) + { + *reasonIfUnsupported = "Layer is not supported with float16 data type input"; + } + return false; +} + +template +bool FalseOutputFuncF32(std::string* reasonIfUnsupported, Params&&... params) +{ + if (reasonIfUnsupported) + { + *reasonIfUnsupported = "Layer is not supported with float32 data type output"; + } + return false; +} + +template +bool FalseOutputFuncF16(std::string* reasonIfUnsupported, Params&&... params) +{ + if (reasonIfUnsupported) + { + *reasonIfUnsupported = "Layer is not supported with float16 data type output"; + } + return false; +} + } diff --git a/src/armnn/LayersFwd.hpp b/src/armnn/LayersFwd.hpp index 64d5dcea9b..e79149f28f 100644 --- a/src/armnn/LayersFwd.hpp +++ b/src/armnn/LayersFwd.hpp @@ -10,6 +10,8 @@ #include "layers/AdditionLayer.hpp" #include "layers/BatchNormalizationLayer.hpp" #include "layers/ConstantLayer.hpp" +#include "layers/ConvertFp16ToFp32Layer.hpp" +#include "layers/ConvertFp32ToFp16Layer.hpp" #include "layers/Convolution2dLayer.hpp" #include "layers/DepthwiseConvolution2dLayer.hpp" #include "layers/FakeQuantizationLayer.hpp" @@ -17,6 +19,7 @@ #include "layers/FullyConnectedLayer.hpp" #include "layers/InputLayer.hpp" #include "layers/L2NormalizationLayer.hpp" +#include "layers/LstmLayer.hpp" #include "layers/MemCopyLayer.hpp" #include "layers/MergerLayer.hpp" #include "layers/MultiplicationLayer.hpp" @@ -60,6 +63,8 @@ DECLARE_LAYER(Activation) DECLARE_LAYER(Addition) DECLARE_LAYER(BatchNormalization) DECLARE_LAYER(Constant) +DECLARE_LAYER(ConvertFp16ToFp32) +DECLARE_LAYER(ConvertFp32ToFp16) DECLARE_LAYER(Convolution2d) DECLARE_LAYER(DepthwiseConvolution2d) DECLARE_LAYER(FakeQuantization) @@ -67,6 +72,7 @@ DECLARE_LAYER(Floor) DECLARE_LAYER(FullyConnected) DECLARE_LAYER(Input) DECLARE_LAYER(L2Normalization) +DECLARE_LAYER(Lstm) DECLARE_LAYER(MemCopy) DECLARE_LAYER(Merger) DECLARE_LAYER(Multiplication) diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp index 3c73d4ccfe..e1f8de3d88 100644 --- a/src/armnn/LoadedNetwork.cpp +++ b/src/armnn/LoadedNetwork.cpp @@ -27,30 +27,54 @@ namespace armnn using namespace std; +namespace +{ + +template +std::string ToErrorMessage(const char * prefix, const ExceptionType & error) +{ + std::stringstream ss; + ss << prefix << " " << error.what(); + return ss.str(); +} + +#if ARMCOMPUTECL_ENABLED +std::string ToErrorMessage(const char * prefix, const cl::Error& error) +{ + std::stringstream ss; + ss << prefix << " " << error.what() << ". CL error code is: " << error.err(); + return ss.str(); +} +#endif + +} // anonymous + std::unique_ptr LoadedNetwork::MakeLoadedNetwork(std::unique_ptr net, - bool useCpuRefAsFallback) + std::string & errorMessage) { std::unique_ptr loadedNetwork; try { - loadedNetwork.reset(new LoadedNetwork(std::move(net), useCpuRefAsFallback)); + loadedNetwork.reset(new LoadedNetwork(std::move(net))); } catch (const std::runtime_error& error) { - BOOST_LOG_TRIVIAL(error) << "An error occurred when preparing the network workloads: " << error.what(); + errorMessage = ToErrorMessage("An error occurred when preparing the network workloads: ", error); + BOOST_LOG_TRIVIAL(error) << errorMessage; return std::unique_ptr(); } catch (const armnn::Exception& error) { - BOOST_LOG_TRIVIAL(error) << "An error occurred when preparing the network workloads: " << error.what(); + errorMessage = ToErrorMessage("An error occurred when preparing the network workloads: ", error); + BOOST_LOG_TRIVIAL(error) << errorMessage; return std::unique_ptr(); } #if ARMCOMPUTECL_ENABLED catch (const cl::Error& error) { - BOOST_LOG_TRIVIAL(error) << "A CL error occurred attempting to prepare a network workload: " - << error.what() << ". CL error code is: " << error.err(); + errorMessage = ToErrorMessage("A CL error occurred attempting to prepare a network workload: ", error); + BOOST_LOG_TRIVIAL(error) << errorMessage; return std::unique_ptr(); } #endif @@ -58,21 +82,25 @@ std::unique_ptr LoadedNetwork::MakeLoadedNetwork(std::unique_ptr< return loadedNetwork; } -LoadedNetwork::LoadedNetwork(std::unique_ptr net, bool useCpuRefAsFallback) - : m_CpuRef(useCpuRefAsFallback) +LoadedNetwork::LoadedNetwork(std::unique_ptr net) + : m_CpuRef() , m_OptimizedNetwork(std::move(net)) { + // Create a profiler and register it for the current thread. + m_Profiler = std::make_shared(); + ProfilerManager::GetInstance().RegisterProfiler(m_Profiler.get()); + Graph& order = m_OptimizedNetwork->GetGraph().TopologicalSort(); - //first create tensor handlers - //handlers are created before workloads are - //because workload creation can modify some of the handlers - //(for example the splitter and merger layers) + //First create tensor handlers. + //Handlers are created before workloads are. + //Because workload creation can modify some of the handlers, + //(for example the splitter and merger layers). for (auto&& layer : order) { layer->CreateTensorHandles(m_OptimizedNetwork->GetGraph(), GetWorkloadFactory(*layer)); } - //then create workloads + //Then create workloads. for (auto&& layer : order) { const IWorkloadFactory& workloadFactory = GetWorkloadFactory(*layer); @@ -82,7 +110,7 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, bool useCpuR case LayerType::Input: case LayerType::Output: { - // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput() + // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput(). break; } default: @@ -99,15 +127,17 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr net, bool useCpuR } m_WorkloadQueue.push_back(move(workload)); + // release the constant data in the layer.. + layer->ReleaseConstantData(); break; } } } - // set up memory + // Set up memory. m_OptimizedNetwork->GetGraph().AllocateDynamicBuffers(); - // finalize the workload factories before execution + // Finalize the workload factories before execution. m_CpuRef.Finalize(); m_CpuAcc.Finalize(); m_GpuAcc.Finalize(); @@ -159,17 +189,20 @@ const IWorkloadFactory& LoadedNetwork::GetWorkloadFactory(const Layer& layer) co break; } case Compute::CpuRef: - default: { workloadFactory = &m_CpuRef; break; } + default: + { + break; + } } BOOST_ASSERT_MSG(workloadFactory, "No workload factory"); std::string reasonIfUnsupported; - BOOST_ASSERT_MSG(IWorkloadFactory::IsLayerSupported(layer, layer.GetDataType(), reasonIfUnsupported), + BOOST_ASSERT_MSG(IWorkloadFactory::IsLayerSupported(layer, {}, reasonIfUnsupported), "Factory does not support layer"); boost::ignore_unused(reasonIfUnsupported); @@ -273,19 +306,18 @@ private: Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors, const OutputTensors& outputTensors) { - ARMNN_UPDATE_PROFILING_EVENT_TAG(); ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "EnqueueWorkload"); const Graph& graph = m_OptimizedNetwork->GetGraph(); - // Walk graph to determine the order of execution + // Walk graph to determine the order of execution. if (graph.GetNumLayers() < 2) { BOOST_LOG_TRIVIAL(warning) << "IRuntime::EnqueueWorkload()::Less than two nodes in graph"; return Status::Failure; } - // Data that must be kept alive for the entire execution of the workload + // Data that must be kept alive for the entire execution of the workload. WorkloadData workloadData(inputTensors, outputTensors); if (graph.GetNumInputs() != inputTensors.size()) @@ -293,14 +325,14 @@ Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors, throw InvalidArgumentException("Number of inputs provided does not match network."); } - // for each input to the network, call EnqueueInput with the data passed by the user + // For each input to the network, call EnqueueInput with the data passed by the user. for (const BindableLayer* inputLayer : graph.GetInputLayers()) { const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId()); EnqueueInput(*inputLayer, pin.GetTensorHandle(), pin.GetTensorInfo()); } - // for each output to the network, call EnqueueOutput with the data passed by the user + // For each output to the network, call EnqueueOutput with the data passed by the user. for (const BindableLayer* outputLayer : graph.GetOutputLayers()) { const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId()); @@ -315,7 +347,7 @@ Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors, executionSucceeded = Execute(); } - // Hack: get rid of inputs and outputs we added + // Hack: get rid of inputs and outputs we added. TidyWorkloadQueue(graph.GetNumInputs(), graph.GetNumOutputs()); return executionSucceeded ? Status::Success : Status::Failure; @@ -374,7 +406,7 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* ten BOOST_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input."); - // Get the output handler from the previous node + // Gets the output handler from the previous node. const OutputHandler& outputHandler = layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOutputHandler(); const TensorInfo& inputTensorInfo = outputHandler.GetTensorInfo(); @@ -394,6 +426,10 @@ bool LoadedNetwork::Execute() { bool success = true; + m_CpuRef.Acquire(); + m_CpuAcc.Acquire(); + m_GpuAcc.Acquire(); + try { for (size_t i = 0; i < m_WorkloadQueue.size(); ++i) @@ -415,6 +451,11 @@ bool LoadedNetwork::Execute() success = false; } + // Informs the memory managers to release memory in it's respective memory group + m_CpuRef.Release(); + m_CpuAcc.Release(); + m_GpuAcc.Release(); + return success; } diff --git a/src/armnn/LoadedNetwork.hpp b/src/armnn/LoadedNetwork.hpp index 79a0b267e9..286f804234 100644 --- a/src/armnn/LoadedNetwork.hpp +++ b/src/armnn/LoadedNetwork.hpp @@ -8,6 +8,7 @@ #include "armnn/Types.hpp" #include "Network.hpp" #include "LayerFwd.hpp" +#include "Profiling.hpp" #include "backends/RefWorkloadFactory.hpp" #include "backends/NeonWorkloadFactory.hpp" #include "backends/ClWorkloadFactory.hpp" @@ -33,10 +34,15 @@ public: Status EnqueueWorkload(const InputTensors& inputTensors, const OutputTensors& outputTensors); static std::unique_ptr MakeLoadedNetwork(std::unique_ptr net, - bool useCpuRefAsFallback); + std::string & errorMessage); + + // NOTE we return by reference as the purpose of this method is only to provide + // access to the private m_Profiler and in theory we should not need to increment + // the shared_ptr's reference counter + const std::shared_ptr& GetProfiler() const { return m_Profiler; } private: - LoadedNetwork(std::unique_ptr net, bool useCpuRefAsFallback); + LoadedNetwork(std::unique_ptr net); void EnqueueInput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo); @@ -54,6 +60,7 @@ private: std::unique_ptr m_OptimizedNetwork; std::vector< std::unique_ptr > m_WorkloadQueue; + std::shared_ptr m_Profiler; }; } diff --git a/src/armnn/NeonInterceptorScheduler.cpp b/src/armnn/NeonInterceptorScheduler.cpp new file mode 100644 index 0000000000..fc95ef439e --- /dev/null +++ b/src/armnn/NeonInterceptorScheduler.cpp @@ -0,0 +1,57 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "NeonInterceptorScheduler.hpp" + +#include + +namespace armnn{ + +NeonInterceptorScheduler::NeonInterceptorScheduler(NeonTimer::KernelMeasurements& kernels, + arm_compute::IScheduler &realScheduler) + : m_Kernels(kernels), m_RealScheduler(realScheduler) +{ +} + +void NeonInterceptorScheduler::set_num_threads(unsigned int numThreads) +{ + m_RealScheduler.set_num_threads(numThreads); +} + +unsigned int NeonInterceptorScheduler::num_threads() const +{ + return m_RealScheduler.num_threads(); +} + +void NeonInterceptorScheduler::schedule(arm_compute::ICPPKernel* kernel, const Hints& hints) +{ + m_Timer.Start(); + m_RealScheduler.schedule(kernel, hints.split_dimension()); + m_Timer.Stop(); + + std::vector measurements = m_Timer.GetMeasurements(); + BOOST_ASSERT(!measurements.empty()); + + Measurement measurement(measurements.front()); // NOTE: 1st measurement is delta + measurement.m_Name = kernel->name(); + m_Kernels.push_back(std::move(measurement)); +} + +void NeonInterceptorScheduler::run_workloads(std::vector & workloads) +{ + m_Timer.Start(); + m_RealScheduler.run_workloads(workloads); + m_Timer.Stop(); + + std::vector measurements = m_Timer.GetMeasurements(); + BOOST_ASSERT_MSG(measurements.size() == 3, "WallClockTimer does not have correct amount of measurements."); + + // WallClockTimer has 3 measurements, duration always being the first. + Measurement measurement(measurements.front()); + measurement.m_Name = "Workload"; + m_Kernels.push_back(std::move(measurement)); +} + +} // namespace armnn \ No newline at end of file diff --git a/src/armnn/NeonInterceptorScheduler.hpp b/src/armnn/NeonInterceptorScheduler.hpp new file mode 100644 index 0000000000..b8ecbd59c2 --- /dev/null +++ b/src/armnn/NeonInterceptorScheduler.hpp @@ -0,0 +1,37 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "NeonTimer.hpp" +#include "WallClockTimer.hpp" + +#include +#include +#include + +namespace armnn +{ + +class NeonInterceptorScheduler : public arm_compute::IScheduler +{ +public: + NeonInterceptorScheduler(NeonTimer::KernelMeasurements &kernels, arm_compute::IScheduler &realScheduler); + ~NeonInterceptorScheduler() = default; + + void set_num_threads(unsigned int numThreads) override; + + unsigned int num_threads() const override; + + void schedule(arm_compute::ICPPKernel *kernel, const Hints &hints) override; + + void run_workloads(std::vector &workloads) override; + +private: + NeonTimer::KernelMeasurements& m_Kernels; + arm_compute::IScheduler& m_RealScheduler; + WallClockTimer m_Timer; +}; + +} // namespace armnn diff --git a/src/armnn/NeonTimer.cpp b/src/armnn/NeonTimer.cpp new file mode 100644 index 0000000000..0c1e2e6a34 --- /dev/null +++ b/src/armnn/NeonTimer.cpp @@ -0,0 +1,56 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "NeonTimer.hpp" +#include "NeonInterceptorScheduler.hpp" + +#include + +#include +#include + +namespace armnn +{ + +void NeonTimer::Start() +{ + m_Kernels.clear(); + m_RealSchedulerType = arm_compute::Scheduler::get_type(); + //Note: We can't currently replace a custom scheduler + if(m_RealSchedulerType != arm_compute::Scheduler::Type::CUSTOM) + { + // Keep the real schedule and add NeonInterceptorScheduler as an interceptor + m_RealScheduler = &arm_compute::Scheduler::get(); + auto interceptor = std::make_shared(m_Kernels, *m_RealScheduler); + arm_compute::Scheduler::set(std::static_pointer_cast(interceptor)); + } +} + +void NeonTimer::Stop() +{ + // Restore real scheduler + arm_compute::Scheduler::set(m_RealSchedulerType); + m_RealScheduler = nullptr; +} + +std::vector NeonTimer::GetMeasurements() const +{ + std::vector measurements = m_Kernels; + unsigned int kernel_number = 0; + for (auto & kernel : measurements) + { + std::string kernelName = std::string(this->GetName()) + "/" + std::to_string(kernel_number++) + ": " + kernel + .m_Name; + kernel.m_Name = kernelName; + } + return measurements; +} + +const char* NeonTimer::GetName() const +{ + return "NeonKernelTimer"; +} + +} diff --git a/src/armnn/NeonTimer.hpp b/src/armnn/NeonTimer.hpp new file mode 100644 index 0000000000..5685c4a6fe --- /dev/null +++ b/src/armnn/NeonTimer.hpp @@ -0,0 +1,43 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "Instrument.hpp" + +#include +#include +#include + +#include +#include +#include + +namespace armnn +{ + +class NeonTimer : public Instrument +{ +public: + using KernelMeasurements = std::vector; + + NeonTimer() = default; + ~NeonTimer() = default; + + void Start() override; + + void Stop() override; + + std::vector GetMeasurements() const override; + + const char* GetName() const override; + +private: + KernelMeasurements m_Kernels; + arm_compute::IScheduler* m_RealScheduler; + arm_compute::Scheduler::Type m_RealSchedulerType; +}; + +} \ No newline at end of file diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp index 0a5325c2a4..f510207c06 100644 --- a/src/armnn/Network.cpp +++ b/src/armnn/Network.cpp @@ -5,16 +5,21 @@ #include "Network.hpp" #include "Graph.hpp" #include "Layer.hpp" +#include "DeviceSpec.hpp" #include "backends/CpuTensorHandle.hpp" #include "backends/WorkloadFactory.hpp" #include "Optimizer.hpp" +#include "armnn/Exceptions.hpp" #include +#include #include #include #include #include +#include +#include #include #include @@ -22,6 +27,8 @@ #include #include +#include "optimizations/All.hpp" + namespace armnn { @@ -62,43 +69,195 @@ Status OptimizedNetwork::SerializeToDot(std::ostream& stream) const return m_Graph->SerializeToDot(stream); } -IOptimizedNetworkPtr Optimize(const INetwork& inNetwork, const DeviceSpec& deviceSpec) +IOptimizedNetworkPtr Optimize(const INetwork& inNetwork, + const std::vector& backendPreferences, + const IDeviceSpec& deviceSpec, + const OptimizerOptions& options) { + if (backendPreferences.empty()) { + throw armnn::InvalidArgumentException("Invoked Optimize with no backends specified"); + } const Network& network = *boost::polymorphic_downcast(&inNetwork); std::unique_ptr graph = std::make_unique(network.GetGraph()); - OptimizedNetwork* optNet = new OptimizedNetwork(std::move(graph)); + auto optNet = IOptimizedNetworkPtr(new OptimizedNetwork(std::move(graph)), &IOptimizedNetwork::Destroy); - Optimizer::Optimize(optNet->GetGraph()); + OptimizedNetwork* optNetObjPtr = boost::polymorphic_downcast(optNet.get()); + + // Perform optimisation passes + using namespace optimizations; + Optimizer::Pass(optNetObjPtr->GetGraph(), MakeOptimizations(SquashEqualPermuteSiblings(), + SquashEqualReshapeSiblings(), + OptimizeInversePermutes(), + MovePermuteUp(), + PermuteAsReshape(), + OptimizeConsecutiveReshapes())); // Infer the tensor infos for all output slots. Throws an exception on failure. - optNet->GetGraph().InferTensorInfos(); + optNetObjPtr->GetGraph().InferTensorInfos(); - // Assign a compute device for all nodes - for (auto&& layer : optNet->GetGraph()) + // if Fp32 to Fp16 optimization is set convert Fp32 network to Fp16 + if (options.m_ReduceFp32ToFp16) { - DataType dataType = layer->GetDataType(); + Optimizer::Pass(optNetObjPtr->GetGraph(), MakeOptimizations(Fp32NetworkToFp16Converter())); + } + + // We know that DeviceSpec should be the only implementation of IDeviceSpec. + const DeviceSpec& spec = *boost::polymorphic_downcast(&deviceSpec); + + // determine which of the preferred backends we have available for use + // and whether we have specified CpuRef as one of those backends. + bool cpuRefUsed = false; + std::vector availablePreferredBackends; + for (const armnn::Compute& backend : backendPreferences) + { + // Check if the backend is in the available backend devices. + if (std::find(spec.m_SupportedComputeDevices.begin(), + spec.m_SupportedComputeDevices.end(), backend) != + spec.m_SupportedComputeDevices.end()) + { + availablePreferredBackends.push_back(backend); + if (armnn::Compute::CpuRef == backend) { + cpuRefUsed = true; + } + } + } + if (availablePreferredBackends.empty()) { + BOOST_LOG_TRIVIAL(warning) << "None of the preferred backends " << backendPreferences + << " are supported. Current platform provides " << spec.m_SupportedComputeDevices; + return {nullptr, &IOptimizedNetwork::Destroy}; + } - // Default to the user-requested compute device from the Runtime - layer->SetComputeDevice(deviceSpec.DefaultComputeDevice); + auto ReturnWithError = [&](Layer* layer) + { + BOOST_LOG_TRIVIAL(warning) << "Layer of type " << GetLayerTypeAsCString(layer->GetType()) + << " is not supported on any preferred backend " << backendPreferences; + return IOptimizedNetworkPtr(nullptr, &IOptimizedNetwork::Destroy); + }; - // If the layer is unsupported by this device, fall back to reference + // Assign a compute device for all nodes + for (auto&& layer : optNetObjPtr->GetGraph()) + { + DataType dataType = layer->GetDataType(); std::string reasonIfUnsupported; - if (!IWorkloadFactory::IsLayerSupported(*layer, dataType, reasonIfUnsupported)) + bool found = false; + for (const armnn::Compute& backend : availablePreferredBackends) { - BOOST_LOG_TRIVIAL(warning) << "Layer of type " << GetLayerTypeAsCString(layer->GetType()) << - " is not supported on requested backend " << layer->GetComputeDevice() << " (reason: " << - reasonIfUnsupported << "), falling back to CpuRef backend."; - layer->SetComputeDevice(Compute::CpuRef); + // need to set the compute device on the layer + // before we can check if it is supported + layer->SetComputeDevice(backend); + if (!IWorkloadFactory::IsLayerSupported(*layer, dataType, reasonIfUnsupported)) + { + if (dataType == DataType::Float16) + { + if (IWorkloadFactory::IsLayerSupported(*layer, DataType::Float32, reasonIfUnsupported) + && layer->GetType() != LayerType::ConvertFp32ToFp16 + && layer->GetType() != LayerType::ConvertFp16ToFp32) + { + // Insert FP16 -> FP32 conversion layer before current layer + std::vector convertFp16ToFp32Layers = + InsertConvertFp16ToFp32LayersBefore(optNetObjPtr->GetGraph(), *layer); + + // Insert FP32 -> FP16 conversion layer after current layer + std::vector convertFp32ToFp16Layers = + InsertConvertFp32ToFp16LayersAfter(optNetObjPtr->GetGraph(), *layer); + + // Assign a supported backend to the newly introduced conversion layers + auto AssignFirstSupportedBackend = [&](Layer* layer, Compute preferredBackend) + { + bool supportedBackendFound = false; + std::string reasonIfUnsupported; + + // Try preferred backend first + layer->SetComputeDevice(preferredBackend); + if (IWorkloadFactory::IsLayerSupported(*layer, boost::none, reasonIfUnsupported)) + { + supportedBackendFound = true; + } + else + { + for (const Compute& backend : availablePreferredBackends) + { + // Skip preferred backend (we already determined that it is not supported) + if (backend == preferredBackend) + { + continue; + } + + layer->SetComputeDevice(backend); + if (IWorkloadFactory::IsLayerSupported(*layer, boost::none, reasonIfUnsupported)) + { + supportedBackendFound = true; + break; + } + } + } + + return supportedBackendFound; + }; + + for (ConvertFp16ToFp32Layer* convertLayer : convertFp16ToFp32Layers) + { + if (!AssignFirstSupportedBackend(convertLayer, backend)) + { + return ReturnWithError(convertLayer); + } + } + + for (ConvertFp32ToFp16Layer* convertLayer : convertFp32ToFp16Layers) + { + if (!AssignFirstSupportedBackend(convertLayer, backend)) + { + return ReturnWithError(convertLayer); + } + } + + found = true; + break; + } + } + BOOST_LOG_TRIVIAL(warning) << "Layer of type " << GetLayerTypeAsCString(layer->GetType()) + << " is not supported on requested backend " << layer->GetComputeDevice() + << " (reason: " << reasonIfUnsupported + << "), falling back to the next backend."; + } + else + { + found = true; + break; + } } - BOOST_ASSERT_MSG(IWorkloadFactory::IsLayerSupported(*layer, dataType, reasonIfUnsupported), - "Layer has no valid compute device"); + // If the layer is unsupported by any devices, log and return a null network. + if (!found) { + // NOTE: if the layer is not an operation queue type AND we have not got CpuRef as a + // fallback we should set the compute device on the layer to CpuRef (these are not + // available as accelerated operations, or are only available under certain + // conditions, currently they comprise MemCopy, Constant, Permute) + armnn::LayerType layerType = layer->GetType(); + if (!cpuRefUsed && (layerType == armnn::LayerType::MemCopy || + layerType == armnn::LayerType::Constant || + layerType == armnn::LayerType::Permute)) + { + layer->SetComputeDevice(armnn::Compute::CpuRef); + } + else + { + return ReturnWithError(layer); + } + } } - optNet->GetGraph().AddCopyLayers(); + Optimizer::Pass(optNetObjPtr->GetGraph(), MakeOptimizations(OptimizeInverseConversionsFp16(), + OptimizeInverseConversionsFp32())); + + optNetObjPtr->GetGraph().AddCopyLayers(); + + // Convert constants + Optimizer::Pass(optNetObjPtr->GetGraph(), MakeOptimizations(ConvertConstantsFloatToHalf())); + Optimizer::Pass(optNetObjPtr->GetGraph(), MakeOptimizations(ConvertConstantsHalfToFloat())); - return {optNet, &IOptimizedNetwork::Destroy}; + return optNet; } Network::Network() @@ -116,9 +275,9 @@ IConnectableLayer* Network::AddInputLayer(LayerBindingId id, const char* name) } IConnectableLayer* Network::AddFullyConnectedLayerImpl(const FullyConnectedDescriptor& fullyConnectedDescriptor, - const ConstTensor& weights, - const ConstTensor* biases, - const char* name) + const ConstTensor& weights, + const ConstTensor* biases, + const char* name) { if (fullyConnectedDescriptor.m_BiasEnabled && (biases == nullptr)) { @@ -138,24 +297,24 @@ IConnectableLayer* Network::AddFullyConnectedLayerImpl(const FullyConnectedDescr } IConnectableLayer* Network::AddFullyConnectedLayer(const FullyConnectedDescriptor& fullyConnectedDescriptor, - const ConstTensor& weights, - const char* name) + const ConstTensor& weights, + const char* name) { return AddFullyConnectedLayerImpl(fullyConnectedDescriptor, weights, nullptr, name); } IConnectableLayer* Network::AddFullyConnectedLayer(const FullyConnectedDescriptor& fullyConnectedDescriptor, - const ConstTensor& weights, - const ConstTensor& biases, - const char* name) + const ConstTensor& weights, + const ConstTensor& biases, + const char* name) { return AddFullyConnectedLayerImpl(fullyConnectedDescriptor, weights, &biases, name); } IConnectableLayer* Network::AddConvolution2dLayerImpl(const Convolution2dDescriptor& convolution2dDescriptor, - const ConstTensor& weights, - const ConstTensor* biases, - const char* name) + const ConstTensor& weights, + const ConstTensor* biases, + const char* name) { if (convolution2dDescriptor.m_BiasEnabled && (biases == nullptr)) { @@ -175,15 +334,15 @@ IConnectableLayer* Network::AddConvolution2dLayerImpl(const Convolution2dDescrip } IConnectableLayer* Network::AddConvolution2dLayer(const Convolution2dDescriptor& convolution2dDescriptor, - const ConstTensor& weights, - const char* name) + const ConstTensor& weights, + const char* name) { return AddConvolution2dLayerImpl(convolution2dDescriptor, weights, nullptr, name); } IConnectableLayer* Network::AddConvolution2dLayer(const Convolution2dDescriptor& convolution2dDescriptor, - const ConstTensor& weights, - const ConstTensor& biases, - const char* name) + const ConstTensor& weights, + const ConstTensor& biases, + const char* name) { return AddConvolution2dLayerImpl(convolution2dDescriptor, weights, &biases, name); } @@ -199,7 +358,8 @@ IConnectableLayer* Network::AddDepthwiseConvolution2dLayerImpl( throw InvalidArgumentException("AddDepthwiseConvolution2dLayer: biases cannot be NULL"); } - const auto layer = m_Graph->AddLayer(convolution2dDescriptor, name); + const auto layer = m_Graph->AddLayer(convolution2dDescriptor, + name); layer->m_Weight = std::make_unique(weights); @@ -245,7 +405,8 @@ IConnectableLayer* Network::AddActivationLayer(const ActivationDescriptor& activ return m_Graph->AddLayer(activationDescriptor, name); } -IConnectableLayer* Network::AddNormalizationLayer(const NormalizationDescriptor& normalizationDescriptor, +IConnectableLayer* Network::AddNormalizationLayer(const NormalizationDescriptor& +normalizationDescriptor, const char* name) { return m_Graph->AddLayer(normalizationDescriptor, name); @@ -301,7 +462,8 @@ IConnectableLayer* Network::AddBatchNormalizationLayer(const BatchNormalizationD return layer; } -IConnectableLayer* Network::AddResizeBilinearLayer(const ResizeBilinearDescriptor& resizeDescriptor, const char* name) +IConnectableLayer* Network::AddResizeBilinearLayer(const ResizeBilinearDescriptor& +resizeDescriptor, const char* name) { return m_Graph->AddLayer(resizeDescriptor,name); } @@ -313,10 +475,15 @@ IConnectableLayer* Network::AddL2NormalizationLayer(const char* name) IConnectableLayer* Network::AddConstantLayer(const ConstTensor& input, const char* name) { - return m_Graph->AddLayer(std::make_shared(input), name); + auto layer = m_Graph->AddLayer(name); + + layer->m_LayerOutput = std::make_unique(input); + + return layer; } -IConnectableLayer* Network::AddReshapeLayer(const ReshapeDescriptor& reshapeDescriptor, const char* name) +IConnectableLayer* Network::AddReshapeLayer(const ReshapeDescriptor& reshapeDescriptor, + const char* name) { return m_Graph->AddLayer(reshapeDescriptor, name); } @@ -326,6 +493,97 @@ IConnectableLayer* Network::AddFloorLayer(const char* name) return m_Graph->AddLayer(name); } +IConnectableLayer* Network::AddLstmLayer(const LstmDescriptor& descriptor, + const LstmInputParams& params, + const char* name) +{ + const auto layer = m_Graph->AddLayer(descriptor, name); + + //Lstm Basic Parameters + layer->m_BasicParameters.m_InputToForgetWeights = + std::make_unique(*(params.m_InputToForgetWeights)); + layer->m_BasicParameters.m_InputToCellWeights = + std::make_unique(*(params.m_InputToCellWeights)); + layer->m_BasicParameters.m_InputToOutputWeights = + std::make_unique(*(params.m_InputToOutputWeights)); + layer->m_BasicParameters.m_RecurrentToForgetWeights = + std::make_unique(*(params.m_RecurrentToForgetWeights)); + layer->m_BasicParameters.m_RecurrentToCellWeights = + std::make_unique(*(params.m_RecurrentToCellWeights)); + layer->m_BasicParameters.m_RecurrentToOutputWeights = + std::make_unique(*(params.m_RecurrentToOutputWeights)); + layer->m_BasicParameters.m_ForgetGateBias = + std::make_unique(*(params.m_ForgetGateBias)); + layer->m_BasicParameters.m_CellBias = + std::make_unique(*(params.m_CellBias)); + layer->m_BasicParameters.m_OutputGateBias = + std::make_unique(*(params.m_OutputGateBias)); + + //Lstm Cifg parameters + if(!descriptor.m_CifgEnabled) + { + if(params.m_InputToInputWeights == nullptr) + { + throw InvalidArgumentException("AddLstmLayer: Input To Input Weights cannot be NULL"); + } + if(params.m_RecurrentToInputWeights == nullptr) + { + throw InvalidArgumentException( + "AddLstmLayer: Recurrent To Input Weights cannot be NULL"); + } + if(params.m_InputGateBias == nullptr) + { + throw InvalidArgumentException("AddLstmLayer: Input Gate Bias cannot be NULL"); + } + layer->m_CifgParameters.m_InputToInputWeights = + std::make_unique(*(params.m_InputToInputWeights)); + layer->m_CifgParameters.m_RecurrentToInputWeights = + std::make_unique(*(params.m_RecurrentToInputWeights)); + // In the VTS tests, cell-to-input weights may be null, even if the other CIFG params are not. + if(params.m_CellToInputWeights != nullptr) + { + layer->m_CifgParameters.m_CellToInputWeights = + std::make_unique(*(params.m_CellToInputWeights)); + } + layer->m_CifgParameters.m_InputGateBias = + std::make_unique(*(params.m_InputGateBias)); + } + + //Lstm projection parameters + if(descriptor.m_ProjectionEnabled) + { + if(params.m_ProjectionWeights == nullptr) + { + throw InvalidArgumentException("AddLstmLayer: Projection Weights cannot be NULL"); + } + layer->m_ProjectionParameters.m_ProjectionWeights = + std::make_unique(*(params.m_ProjectionWeights)); + if(params.m_ProjectionBias != nullptr) + { + layer->m_ProjectionParameters.m_ProjectionBias = + std::make_unique(*(params.m_ProjectionBias)); + } + } + + //Lstm Peephole params + if(descriptor.m_PeepholeEnabled) + { + if(params.m_CellToForgetWeights == nullptr) + { + throw InvalidArgumentException("AddLstmLayer: Cell To Forget Weights cannot be NULL"); + } + if(params.m_CellToOutputWeights == nullptr) + { + throw InvalidArgumentException("AddLstmLayer: Cell To Output Weights cannot be NULL"); + } + layer->m_PeepholeParameters.m_CellToForgetWeights = + std::make_unique(*(params.m_CellToForgetWeights)); + layer->m_PeepholeParameters.m_CellToOutputWeights = + std::make_unique(*(params.m_CellToOutputWeights)); + } + return layer; +} + OptimizedNetwork::OptimizedNetwork(std::unique_ptr graph) : m_Graph(std::move(graph)) { @@ -336,4 +594,3 @@ OptimizedNetwork::~OptimizedNetwork() } } // namespace armnn - diff --git a/src/armnn/Network.hpp b/src/armnn/Network.hpp index 4eb67b1a15..72100aae6c 100644 --- a/src/armnn/Network.hpp +++ b/src/armnn/Network.hpp @@ -5,6 +5,7 @@ #pragma once #include +#include #include #include @@ -20,7 +21,7 @@ namespace armnn { class Graph; -/// Private implementation of INetwork +/// Private implementation of INetwork. class Network final : public INetwork { public: @@ -108,6 +109,10 @@ public: IConnectableLayer* AddOutputLayer(LayerBindingId id, const char* name = nullptr) override; + IConnectableLayer* AddLstmLayer(const LstmDescriptor& descriptor, + const LstmInputParams& params, + const char* name = nullptr) override; + private: IConnectableLayer* AddFullyConnectedLayerImpl(const FullyConnectedDescriptor& fullyConnectedDescriptor, const ConstTensor& weights, diff --git a/src/armnn/NetworkUtils.hpp b/src/armnn/NetworkUtils.hpp new file mode 100644 index 0000000000..0228813a25 --- /dev/null +++ b/src/armnn/NetworkUtils.hpp @@ -0,0 +1,79 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "Graph.hpp" + +namespace armnn +{ + +inline std::vector InsertConvertFp16ToFp32LayersBefore(Graph& graph, Layer& layer) +{ + std::vector convertLayers; + convertLayers.reserve(layer.GetNumInputSlots()); + + for (auto&& inputSlot = layer.BeginInputSlots(); inputSlot != layer.EndInputSlots(); ++inputSlot) + { + // Insert FP16 to FP32 converter layer before the layer + const std::string name = + std::string("convert_fp16_to_fp32-" + std::to_string(inputSlot->GetSlotIndex()) + "-") + layer.GetName(); + ConvertFp16ToFp32Layer* convertLayer = + graph.InsertNewLayer(*inputSlot, name.c_str()); + + // Sets output tensor info for the convert layer + TensorInfo convertInfo = convertLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(); + convertInfo.SetDataType(DataType::Float32); + + convertLayer->GetOutputSlot().SetTensorInfo(convertInfo); + + convertLayers.emplace_back(convertLayer); + } + + // Sets the output tensor info for the unsupported layer + auto UpdateTensorInfo = [](auto& outputSlot) + { + // Copy original tensor info and change data type to FP32 + TensorInfo newTensorInfo = outputSlot.GetTensorInfo(); + newTensorInfo.SetDataType(DataType::Float32); + + outputSlot.SetTensorInfo(newTensorInfo); + }; + + std::for_each(layer.BeginOutputSlots(), layer.EndOutputSlots(), UpdateTensorInfo); + + return convertLayers; +} + +inline std::vector InsertConvertFp32ToFp16LayersAfter(Graph& graph, Layer& layer) +{ + std::vector convertLayers; + convertLayers.reserve(layer.GetNumOutputSlots()); + + int index = 0; + // Change outputs to DataType::Float16 + for (auto&& outputSlot = layer.BeginOutputSlots(); outputSlot != layer.EndOutputSlots(); ++outputSlot) + { + BOOST_ASSERT(outputSlot->GetTensorInfo().GetDataType() == DataType::Float32); + + // Insert FP32 to FP16 converter layer after the layer + const std::string name = + std::string("convert_fp32_to_fp16-" + std::to_string(index++) + "-") + layer.GetName(); + ConvertFp32ToFp16Layer* convertLayer = + graph.InsertNewLayer(*outputSlot, name.c_str()); + + // Sets output tensor info for the convert layer. + TensorInfo convertInfo = convertLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(); + convertInfo.SetDataType(DataType::Float16); + + convertLayer->GetOutputSlot().SetTensorInfo(convertInfo); + + convertLayers.emplace_back(convertLayer); + } + + return convertLayers; +} + +} //namespace armnn \ No newline at end of file diff --git a/src/armnn/Observable.cpp b/src/armnn/Observable.cpp new file mode 100644 index 0000000000..7179a10ccd --- /dev/null +++ b/src/armnn/Observable.cpp @@ -0,0 +1,36 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "Observable.hpp" + +namespace armnn +{ + +void AddedLayerObservable::Update(Layer* graphLayer) +{ + m_ObservedObjects.emplace_back(graphLayer); +} + +void ErasedLayerNamesObservable::Update(Layer* graphLayer) +{ + auto& relatedLayerNames = graphLayer->GetRelatedLayerNames(); + + // If the erased layer has no related layers we take the erased layer's name + // Otherwise we need to preserve the related layer names, + // since we want to preserve the original graph's information + if (relatedLayerNames.empty()) + { + m_ObservedObjects.emplace_back(graphLayer->GetName()); + } + else + { + for (auto& relatedLayerName : relatedLayerNames) + { + m_ObservedObjects.emplace_back(relatedLayerName); + } + } +} + +} diff --git a/src/armnn/Observable.hpp b/src/armnn/Observable.hpp new file mode 100644 index 0000000000..8f33c0b3e3 --- /dev/null +++ b/src/armnn/Observable.hpp @@ -0,0 +1,67 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "IGraphObservable.hpp" +#include "Graph.hpp" + +namespace armnn +{ + +template +class GraphObservable : public IGraphObservable +{ +public: + using Iterator = typename std::list::const_iterator; + + GraphObservable(Graph& subject, GraphEvent notifyOnEvent) + : m_Subject(&subject) + { + m_NotifyOnEvent = notifyOnEvent; + m_Subject->AttachObservable(this, m_NotifyOnEvent); + }; + + void Clear() { m_ObservedObjects.clear(); }; + + Iterator begin() { return m_ObservedObjects.begin(); } + + Iterator end() { return m_ObservedObjects.end(); } + +protected: + ~GraphObservable() + { + if (m_Subject) + { + m_Subject->DetachObservable(this, m_NotifyOnEvent); + } + } + + GraphEvent m_NotifyOnEvent; + Graph* m_Subject; + std::list m_ObservedObjects; +}; + +class AddedLayerObservable : public GraphObservable +{ +public: + explicit AddedLayerObservable(Graph& subject) + : GraphObservable(subject, GraphEvent::LayerAdded) + {}; + + void Update(Layer* graphLayer) override; +}; + +class ErasedLayerNamesObservable : public GraphObservable +{ +public: + explicit ErasedLayerNamesObservable(Graph& subject) + : GraphObservable(subject, GraphEvent::LayerErased) + {}; + + void Update(Layer* graphLayer) override; +}; + +} //namespace armnn + diff --git a/src/armnn/OpenClTimer.cpp b/src/armnn/OpenClTimer.cpp new file mode 100644 index 0000000000..8559fefafd --- /dev/null +++ b/src/armnn/OpenClTimer.cpp @@ -0,0 +1,105 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "OpenClTimer.hpp" + +#include +#include + +namespace armnn +{ + +OpenClTimer::OpenClTimer() +{ +} + +void OpenClTimer::Start() +{ + m_Kernels.clear(); + + auto interceptor = [this]( cl_command_queue command_queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t *gwo, + const size_t *gws, + const size_t *lws, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) + { + cl_int retVal = 0; + + // Get the name of the kernel + cl::Kernel retainedKernel(kernel, true); + std::stringstream ss; + ss << retainedKernel.getInfo(); + + // Embed workgroup sizes into the name + if(gws != nullptr) + { + ss << " GWS[" << gws[0] << "," << gws[1] << "," << gws[2] << "]"; + } + if(lws != nullptr) + { + ss << " LWS[" << lws[0] << "," << lws[1] << "," << lws[2] << "]"; + } + + cl_event customEvent; + + // Forward to original OpenCl function + retVal = m_OriginalEnqueueFunction( command_queue, + kernel, + work_dim, + gwo, + gws, + lws, + num_events_in_wait_list, + event_wait_list, + &customEvent); + + // Store the Kernel info for later GetMeasurements() call + m_Kernels.emplace_back(ss.str(), customEvent); + + return retVal; + }; + + m_OriginalEnqueueFunction = CLSymbols::get().clEnqueueNDRangeKernel_ptr; + CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor; +} + +void OpenClTimer::Stop() +{ + CLSymbols::get().clEnqueueNDRangeKernel_ptr = m_OriginalEnqueueFunction; +} + +std::vector OpenClTimer::GetMeasurements() const +{ + std::vector measurements; + + cl_command_queue_properties clQueueProperties = CLScheduler::get().queue().getInfo(); + + int idx = 0; + for (auto& kernel : m_Kernels) + { + std::string name = std::string(this->GetName()) + "/" + std::to_string(idx++) + ": " + kernel.m_Name; + + double timeUs = 0.0; + if((clQueueProperties & CL_QUEUE_PROFILING_ENABLE) != 0) + { + // Wait for the event to finish before accessing profile results. + kernel.m_Event.wait(); + + cl_ulong start = kernel.m_Event.getProfilingInfo(); + cl_ulong end = kernel.m_Event.getProfilingInfo(); + timeUs = static_cast(end - start) / 1000.0; + } + + measurements.emplace_back(name, timeUs, Measurement::Unit::TIME_US); + } + + return measurements; +} + +} //namespace armnn diff --git a/src/armnn/OpenClTimer.hpp b/src/armnn/OpenClTimer.hpp new file mode 100644 index 0000000000..09d7a8b949 --- /dev/null +++ b/src/armnn/OpenClTimer.hpp @@ -0,0 +1,59 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "Instrument.hpp" + +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/core/CL/OpenCL.h" + +#include +#include + +namespace armnn +{ + +/// OpenClTimer instrument that times all OpenCl kernels executed between calls to Start() and Stop(). +class OpenClTimer : public Instrument +{ +public: + OpenClTimer(); + ~OpenClTimer() = default; + + /// Start the OpenCl timer + void Start() override; + + /// Stop the OpenCl timer + void Stop() override; + + /// Get the name of the timer + /// \return Name of the timer + const char* GetName() const override { return "OpenClKernelTimer"; } + + /// Get the recorded measurements. This will be a list of the execution durations for all the OpenCl kernels. + /// \return Recorded measurements + std::vector GetMeasurements() const override; + +private: + using CLScheduler = arm_compute::CLScheduler; + using CLSymbols = arm_compute::CLSymbols; + using ClEvent = cl::Event; + using ClEnqueueFunc = decltype(CLSymbols::clEnqueueNDRangeKernel_ptr); + + /// Stores info about the OpenCl kernel + struct KernelInfo + { + KernelInfo(const std::string& name, cl_event& event) : m_Name(name), m_Event(event) {} + + std::string m_Name; + ClEvent m_Event; + }; + + std::list m_Kernels; ///< List of all kernels executed + ClEnqueueFunc m_OriginalEnqueueFunction; ///< Keep track of original OpenCl function +}; + +} //namespace armnn \ No newline at end of file diff --git a/src/armnn/Optimizer.cpp b/src/armnn/Optimizer.cpp index 9b76c7fa72..630aa1a27b 100644 --- a/src/armnn/Optimizer.cpp +++ b/src/armnn/Optimizer.cpp @@ -3,6 +3,7 @@ // See LICENSE file in the project root for full license information. // #include "Optimizer.hpp" +#include "Observable.hpp" #include "optimizations/All.hpp" namespace armnn @@ -10,44 +11,50 @@ namespace armnn Optimizer::Optimizer() { - // Add optimizations here - static optimizations::SquashEqualPermuteSiblings squashEqualPermuteSiblings; - static optimizations::SquashEqualReshapeSiblings squashEqualReshapeSiblings; - static optimizations::OptimizeInversePermutes optimizeInversePermutes; - static optimizations::MovePermuteUp movePermuteUp; - static optimizations::PermuteAsReshape permuteAsReshape; - static optimizations::OptimizeConsecutiveReshapes optimizeConsecutiveReshapes; - - // Set optimizations in desired order - m_Optimizations = {&squashEqualPermuteSiblings, - &squashEqualReshapeSiblings, - &optimizeInversePermutes, - &movePermuteUp, - &permuteAsReshape, - &optimizeConsecutiveReshapes, - }; } -void Optimizer::Optimize(Graph& graph) +void Optimizer::Pass(Graph& graph, const Optimizations& optimizations) { - Optimizer optimizer; + // Create observables to observe changes to the graph + AddedLayerObservable addedLayerObservable(graph); + ErasedLayerNamesObservable erasedLayerNamesObservable(graph); + + bool graphNeedsSorting = false; auto it = graph.TopologicalSort().end(); - // Call TopologicalSort() in every iteration to re-order the list in case layers where added/removed. + + // Calls TopologicalSort() for every iteration to re-order the list in case layers were added/removed. while (it != graph.TopologicalSort().begin()) { --it; - for (auto&& optimization : optimizer.m_Optimizations) + for (auto&& optimization : optimizations) { optimization->Run(graph, **it); if ((*it)->IsOutputUnconnected()) { it = graph.EraseLayer(it); + graphNeedsSorting = true; + } + + // Add the names of erased layers as related layers to the new added layers + for (auto& erasedLayerName : erasedLayerNamesObservable) + { + for (auto& addedLayer : addedLayerObservable) + { + addedLayer->AddRelatedLayerName(erasedLayerName); + } + } + + erasedLayerNamesObservable.Clear(); + addedLayerObservable.Clear(); + + if (graphNeedsSorting) + { + graphNeedsSorting = false; break; } } } } - } // namespace armnn diff --git a/src/armnn/Optimizer.hpp b/src/armnn/Optimizer.hpp index 1f5ed026fb..06720b040a 100644 --- a/src/armnn/Optimizer.hpp +++ b/src/armnn/Optimizer.hpp @@ -5,25 +5,48 @@ #pragma once #include +#include +#include "optimizations/All.hpp" namespace armnn { -class Graph; -class Optimization; - class Optimizer { public: + using OptimizationPtr = std::unique_ptr; + using Optimizations = std::vector; - static void Optimize(Graph& graph); + static void Pass(Graph& graph, const Optimizations& optimizations); private: ~Optimizer() = default; Optimizer(); +}; + - std::vector m_Optimizations; +template +void Append(Optimizer::Optimizations& optimizations, T&& optimization) +{ + optimizations.emplace_back(new T(optimization)); }; +template +void Append(Optimizer::Optimizations& optimizations, Front&& front, Others&&... others) +{ + Append(optimizations, std::forward(front)); + Append(optimizations, std::forward(others)...); +}; + +template +Optimizer::Optimizations MakeOptimizations(Args&&... args) +{ + Optimizer::Optimizations optimizations; + + Append(optimizations, std::forward(args)...); + + return optimizations; +} + } // namespace armnn diff --git a/src/armnn/Profiling.cpp b/src/armnn/Profiling.cpp index 15a195e6bd..f70f6a34d1 100644 --- a/src/armnn/Profiling.cpp +++ b/src/armnn/Profiling.cpp @@ -3,8 +3,7 @@ // See LICENSE file in the project root for full license information. // #include "Profiling.hpp" - -#if ARMNN_PROFILING_ENABLED +#include "JsonPrinter.hpp" #if ARMNN_STREAMLINE_ENABLED #include @@ -17,10 +16,12 @@ #include #include #include +#include #include #include -#include +#include +#include namespace armnn { @@ -32,86 +33,128 @@ constexpr std::size_t g_ProfilingEventCountHint = 1024; // Whether profiling reports should include the sequence of events together with their timings. constexpr bool g_WriteProfilingEventSequence = true; -// Whether profiling reports should also report detailed information on events grouped by tag. -// This is used to group stats per inference (see usage of ARMNN_UPDATE_PROFILING_EVENT_TAG in -// Runtime::EnqueueWorkload). This can spam the output stream, so use carefully (or adapt -// the code to just output information for a tag of interest). -constexpr bool g_AggregateProfilingEventsByTag = false; +// Whether profiling reports should also report detailed information on events grouped by inference. +// This can spam the output stream, so use carefully (or adapt the code to just output information +// of interest). +constexpr bool g_AggregateProfilingEventsByInference = true; -// Whether a call to Profiler::AnalyzeEventsAndWriteResults() will be made when the Profiler -// singleton is destroyed. It can be convenient for local tests. -constexpr bool g_WriteReportToStdOutOnProfilerDestruction = true; +// Whether a call to Profiler::AnalyzeEventsAndWriteResults() will be made when the Profiler is destroyed. +// It can be convenient for local tests. +constexpr bool g_WriteReportToStdOutOnProfilerDestruction = false; // Whether events denoting operations running on the GPU should force a sync before/after the event. // This is hardcoded to true for now as the profiling timings are not very useful without it. +#if ARMCOMPUTECL_ENABLED constexpr bool g_ProfilingForceGpuSync = true; +#endif + +Measurement FindMeasurement(const std::string& name, const Event* event) +{ + + BOOST_ASSERT(event != nullptr); + + // Search though the measurements. + for (const auto& measurement : event->GetMeasurements()) + { + if (measurement.m_Name == name) + { + // Measurement found. + return measurement; + } + } + + // Measurement not found. + return Measurement{ "", 0.f, Measurement::Unit::TIME_MS }; +} + +std::vector FindKernelMeasurements(const Event* event) +{ + BOOST_ASSERT(event != nullptr); + + std::vector measurements; + + // Search through the measurements. + for (const auto& measurement : event->GetMeasurements()) + { + if (measurement.m_Name.rfind("OpenClKernelTimer", 0) == 0 + || measurement.m_Name.rfind("NeonKernelTimer", 0) == 0) + { + // Measurement found. + measurements.push_back(measurement); + } + } + + return measurements; +} std::map Profiler::CalculateProfilingEventStats() const { std::map nameToStatsMap; - for (auto&& event : m_EventSequence) + for (const auto& event : m_EventSequence) { - auto mapIter = nameToStatsMap.find(event.m_Label); - if (mapIter != nameToStatsMap.end()) + Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, event.get()); + + double durationMs = measurement.m_Value; + auto it = nameToStatsMap.find(event->GetName()); + if (it != nameToStatsMap.end()) { - ProfilingEventStats& stats = mapIter->second; - stats.m_TotalMs += event.DurationMs(); - stats.m_MinMs = std::min(stats.m_MinMs, event.DurationMs()); - stats.m_MaxMs = std::max(stats.m_MaxMs, event.DurationMs()); + ProfilingEventStats& stats = it->second; + stats.m_TotalMs += durationMs; + stats.m_MinMs = std::min(stats.m_MinMs, durationMs); + stats.m_MaxMs = std::max(stats.m_MaxMs, durationMs); ++stats.m_Count; } else { - ProfilingEventStats stats; - stats.m_TotalMs = event.DurationMs(); - stats.m_MinMs = event.DurationMs(); - stats.m_MaxMs = event.DurationMs(); - stats.m_Count = 1; - - nameToStatsMap[event.m_Label] = stats; + nameToStatsMap.emplace(event->GetName(), ProfilingEventStats{ durationMs, durationMs, durationMs, 1 }); } } return nameToStatsMap; } -void Profiler::AnalyzeEventSequenceAndWriteResults(std::vector::const_iterator first, - std::vector::const_iterator last, - std::ostream& outStream) const +const Event* GetEventPtr(const Event* ptr) { return ptr;} +const Event* GetEventPtr(const std::unique_ptr& ptr) {return ptr.get(); } + +template +void Profiler::AnalyzeEventSequenceAndWriteResults(ItertType first, ItertType last, std::ostream& outStream) const { - // Output event sequence, if needed + // Outputs event sequence, if needed. if (g_WriteProfilingEventSequence) { - // Make sure timestamps are output with 6 decimals, and save old settings + // Makes sure timestamps are output with 6 decimals, and save old settings. std::streamsize oldPrecision = outStream.precision(); outStream.precision(6); std::ios_base::fmtflags oldFlags = outStream.flags(); outStream.setf(std::ios::fixed); - // Output fields + // Outputs fields. outStream << "Event Sequence - Name | Duration (ms) | Start (ms) | Stop (ms) | Device" << std::endl; for (auto event = first; event != last; ++event) { - std::chrono::duration startTimeMs = event->m_StartTime.time_since_epoch(); - std::chrono::duration stopTimeMs = event->m_StopTime.time_since_epoch(); - - outStream << std::setw(50) << event->m_Label << " " - << std::setw(20) << event->DurationMs() - << std::setw(20) << startTimeMs.count() - << std::setw(20) << stopTimeMs.count() - << std::setw(20) << Profiler::Get().GetEventComputeDevice(event->m_Device) - << std::endl; + const Event* eventPtr = GetEventPtr((*event)); + double startTimeMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME_START, eventPtr).m_Value; + double stopTimeMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME_STOP, eventPtr).m_Value; + + // Find the WallClock measurement if there is one. + double durationMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, eventPtr).m_Value; + outStream << std::setw(50) << eventPtr->GetName() << " " + << std::setw(20) << durationMs + << std::setw(20) << startTimeMs + << std::setw(20) << stopTimeMs + << std::setw(20) << GetComputeDeviceAsCString(eventPtr->GetComputeDevice()) + << std::endl; } outStream << std::endl; - // Restore previous precision settings + // Restores previous precision settings. outStream.flags(oldFlags); outStream.precision(oldPrecision); } - // Aggregate results per event name + // Aggregates results per event name. std::map nameToStatsMap = CalculateProfilingEventStats(); - // Output aggregated stats + // Outputs aggregated stats. outStream << "Event Stats - Name | Avg (ms) | Min (ms) | Max (ms) | Total (ms) | Count" << std::endl; for (const auto& pair : nameToStatsMap) { @@ -126,74 +169,236 @@ void Profiler::AnalyzeEventSequenceAndWriteResults(std::vector:: outStream << std::endl; } -Profiler Profiler::s_Instance; - Profiler::Profiler() - : m_EventTag(0) - , m_NestingLevel(0) - , m_EventTagUpdated(false) + : m_ProfilingEnabled(false) { m_EventSequence.reserve(g_ProfilingEventCountHint); #if ARMNN_STREAMLINE_ENABLED - // Initialise streamline annotations + // Initialises streamline annotations. ANNOTATE_SETUP; #endif } Profiler::~Profiler() { - if (g_WriteReportToStdOutOnProfilerDestruction) + if (m_ProfilingEnabled) { - AnalyzeEventsAndWriteResults(std::cout); + if (g_WriteReportToStdOutOnProfilerDestruction) + { + Print(std::cout); + } } + + // Un-register this profiler from the current thread. + ProfilerManager::GetInstance().RegisterProfiler(nullptr); } -void Profiler::BeginEvent(Compute compute, const std::string label) +bool Profiler::IsProfilingEnabled() +{ + return m_ProfilingEnabled; +} + +void Profiler::EnableProfiling(bool enableProfiling) +{ + m_ProfilingEnabled = enableProfiling; +} + +Event* Profiler::BeginEvent(Compute compute, const std::string& label, std::vector&& instruments) { // We need to sync just before the begin event to not include time before the period we want to time. WaitForDevice(compute); - const TimePoint timeStamp = Clock::now(); - m_ObservedMarkers.emplace(Marker{m_EventSequence.size(), label, timeStamp, compute, m_EventTag}); - m_EventSequence.emplace_back(); + Event* parent = m_Parents.empty() ? nullptr : m_Parents.top(); + m_EventSequence.push_back(std::make_unique(label, this, parent, compute, std::move(instruments))); + Event* event = m_EventSequence.back().get(); + event->Start(); #if ARMNN_STREAMLINE_ENABLED - ANNOTATE_CHANNEL_COLOR(m_NestingLevel, GetEventColor(compute), label.c_str()); + ANNOTATE_CHANNEL_COLOR(m_Parents.size(), GetEventColor(compute), label.c_str()); #endif - m_NestingLevel++; + m_Parents.push(event); + return event; } -void Profiler::EndEvent(Compute compute) +void Profiler::EndEvent(Event* event) { - // We need to sync just before the end event to include all the time of the timed period. - WaitForDevice(compute); - - const Marker& marker = m_ObservedMarkers.top(); + event->Stop(); - const TimePoint startTime = marker.m_TimeStamp; - const TimePoint stopTime = Clock::now(); + BOOST_ASSERT(!m_Parents.empty()); + BOOST_ASSERT(event == m_Parents.top()); + m_Parents.pop(); - m_EventSequence[marker.m_Id] = {std::move(marker.m_EventName), - startTime, - stopTime, - marker.m_ComputeDevice, - marker.m_Tag}; - - m_ObservedMarkers.pop(); + Event* parent = m_Parents.empty() ? nullptr : m_Parents.top(); + boost::ignore_unused(parent); + BOOST_ASSERT(event->GetParentEvent() == parent); #if ARMNN_STREAMLINE_ENABLED - ANNOTATE_CHANNEL_END(m_NestingLevel); + ANNOTATE_CHANNEL_END(m_Parents.size()); #endif +} + +int CalcLevel(const Event* eventPtr) +{ + int level=0; + while (eventPtr != nullptr) + { + eventPtr = eventPtr->GetParentEvent(); + level++; + } + return level; +} + +void Profiler::PopulateInferences(std::vector& outInferences, int& outBaseLevel) const +{ + outInferences.reserve(m_EventSequence.size()); + for (const auto& event : m_EventSequence) + { + const Event* eventPtrRaw = event.get(); + if (eventPtrRaw->GetName() == "EnqueueWorkload") + { + outBaseLevel = (outBaseLevel == -1) ? CalcLevel(eventPtrRaw) : outBaseLevel; + outInferences.push_back(eventPtrRaw); + } + } +} + +void Profiler::PopulateDescendants(std::map>& outDescendantsMap) const +{ + for (const auto& event : m_EventSequence) + { + const Event* eventPtrRaw = event.get(); + const Event* parent = eventPtrRaw->GetParentEvent(); + + if (!parent) + { + continue; + } + + auto it = outDescendantsMap.find(parent); + if (it == outDescendantsMap.end()) + { + outDescendantsMap.emplace(parent, std::vector({eventPtrRaw})); + } + else + { + it->second.push_back(eventPtrRaw); + } + } +} + +void Profiler::Print(std::ostream& outStream) const +{ + // Makes sure timestamps are output with 6 decimals, and save old settings. + std::streamsize oldPrecision = outStream.precision(); + outStream.precision(6); + std::ios_base::fmtflags oldFlags = outStream.flags(); + outStream.setf(std::ios::fixed); + JsonPrinter printer(outStream); + + // First find all the "inference" Events and print out duration measurements. + int baseLevel = -1; + std::vector inferences; + PopulateInferences(inferences, baseLevel); + + // Second map out descendants hierarchy + std::map> descendantsMap; + PopulateDescendants(descendantsMap); + + JsonChildObject inferenceObject{"inference_measurements"}; + JsonChildObject layerObject{"layer_measurements"}; + std::vector workloadObjects; + std::map> workloadToKernelObjects; + + for (unsigned int inferenceIndex = 0; inferenceIndex < inferences.size(); ++inferenceIndex) + { + auto inference = inferences[inferenceIndex]; + Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, inference); + inferenceObject.SetUnit(measurement.m_Unit); + inferenceObject.AddMeasurement(measurement.m_Value); + + auto layerEventsIt = descendantsMap.find(inference); + + // Assuming 1 Execute per inference + if (layerEventsIt != descendantsMap.end()) + { + auto layerEvent = layerEventsIt->second[0]; + Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, layerEvent); + layerObject.SetUnit(measurement.m_Unit); + layerObject.AddMeasurement(measurement.m_Value); + + // Get Descendant Events for Execute + auto workloadEventsIt = descendantsMap.find(layerEvent); + for(unsigned int workloadIndex = 0; workloadIndex < workloadEventsIt->second.size(); ++workloadIndex) + { + auto workloadEvent = workloadEventsIt->second[workloadIndex]; + Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, workloadEvent); + std::vector kernelMeasurements = FindKernelMeasurements(workloadEvent); + if (inferenceIndex == 0) + { + // Only add second level once, in case of multiple inferences + JsonChildObject workloadObject{workloadEvent->GetName()}; + workloadObject.SetUnit(measurement.m_Unit); + workloadObjects.push_back(workloadObject); + } + workloadObjects[workloadIndex].AddMeasurement(measurement.m_Value); + + for(unsigned int kernelIndex = 0; kernelIndex < kernelMeasurements.size(); ++kernelIndex) + { + if (inferenceIndex == 0) + { + // Only add kernel measurement once, in case of multiple inferences + JsonChildObject kernelObject{kernelMeasurements[kernelIndex].m_Name}; + kernelObject.SetUnit(kernelMeasurements[kernelIndex].m_Unit); + workloadToKernelObjects[workloadIndex].push_back(kernelObject); + + } + workloadToKernelObjects[workloadIndex][kernelIndex]. + AddMeasurement(kernelMeasurements[kernelIndex].m_Value); + } + } + } + } + + for (auto workloadToKernelPair : workloadToKernelObjects) + { + for (auto kernelObject : workloadToKernelPair.second) + { + workloadObjects[workloadToKernelPair.first].AddChild(kernelObject); + } + } - m_NestingLevel--; + for (auto workloadObject : workloadObjects) + { + layerObject.AddChild(workloadObject); + } + inferenceObject.AddChild(layerObject); + + printer.PrintHeader(); + printer.PrintArmNNHeader(); + + // print inference object, also prints child layer and kernel measurements + printer.PrintJsonChildObject(inferenceObject); + + // end of ArmNN + printer.PrintNewLine(); + printer.PrintFooter(); + + // end of main JSON object + printer.PrintNewLine(); + printer.PrintFooter(); + printer.PrintNewLine(); + + // Restores previous precision settings. + outStream.flags(oldFlags); + outStream.precision(oldPrecision); } void Profiler::AnalyzeEventsAndWriteResults(std::ostream& outStream) const { // Stack should be empty now. - const bool saneMarkerSequence = m_ObservedMarkers.empty(); + const bool saneMarkerSequence = m_Parents.empty(); // Abort if the sequence of markers was found to have incorrect information: // The stats cannot be trusted. @@ -206,39 +411,69 @@ void Profiler::AnalyzeEventsAndWriteResults(std::ostream& outStream) const return; } - // Analyze the full sequence of events - AnalyzeEventSequenceAndWriteResults(m_EventSequence.begin(), m_EventSequence.end(), outStream); + // Analyzes the full sequence of events. + AnalyzeEventSequenceAndWriteResults(m_EventSequence.cbegin(), + m_EventSequence.cend(), + outStream); - // Aggregate events by tag if requested (spams the output stream if done for all tags) - if (m_EventTagUpdated && g_AggregateProfilingEventsByTag) + // Aggregates events by tag if requested (spams the output stream if done for all tags). + if (g_AggregateProfilingEventsByInference) { outStream << std::endl; outStream << "***" << std::endl; - outStream << "*** Per Tag Stats" << std::endl; + outStream << "*** Per Inference Stats" << std::endl; outStream << "***" << std::endl; outStream << std::endl; - for (auto iter = m_EventSequence.begin(); iter != m_EventSequence.end();) - { - const uint32_t tag = iter->m_Tag; + int baseLevel = -1; + std::vector inferences; + PopulateInferences(inferences, baseLevel); - // Advance iter until we find the first non-matching tag - auto tagEndIter = iter; - for (; tagEndIter != m_EventSequence.end(); ++tagEndIter) + // Second map out descendants hierarchy + std::map> descendantsMap; + PopulateDescendants(descendantsMap); + + std::function&)> + FindDescendantEvents = [&](const Event* eventPtr, + std::vector& sequence) { - if (tagEndIter->m_Tag != tag) + sequence.push_back(eventPtr); + + if (CalcLevel(eventPtr) > baseLevel+2) //We only care about levels as deep as workload executions. { - break; + return; } - } - outStream << "> Begin Tag: " << tag << std::endl; + auto children = descendantsMap.find(eventPtr); + if (children == descendantsMap.end()) + { + return; + } + + for (const Event* child : children->second) + { + return FindDescendantEvents(child, sequence); + } + }; + + // Third, find events belonging to each inference + int inferenceIdx = 0; + for (auto inference : inferences) + { + std::vector sequence; + + //build sequence, depth first + FindDescendantEvents(inference, sequence); + + outStream << "> Begin Inference: " << inferenceIdx << std::endl; outStream << std::endl; - AnalyzeEventSequenceAndWriteResults(iter, tagEndIter, outStream); + AnalyzeEventSequenceAndWriteResults(sequence.cbegin(), + sequence.cend(), + outStream); outStream << std::endl; - outStream << "> End Tag: " << tag << std::endl; + outStream << "> End Inference: " << inferenceIdx << std::endl; - iter = tagEndIter; + inferenceIdx++; } } } @@ -253,21 +488,6 @@ void Profiler::WaitForDevice(Compute compute) const #endif } -const char* Profiler::GetEventComputeDevice(Compute compute) const -{ - switch(compute) - { - case Compute::CpuRef: - return "CpuRef"; - case Compute::CpuAcc: - return "CpuAcc"; - case Compute::GpuAcc: - return "GpuAcc"; - default: - return "Undefined"; - } -} - std::uint32_t Profiler::GetEventColor(Compute compute) const { switch(compute) @@ -287,7 +507,24 @@ std::uint32_t Profiler::GetEventColor(Compute compute) const } } -} // namespace armnn +// The thread_local pointer to the profiler instance. +thread_local Profiler* tl_Profiler = nullptr; + +ProfilerManager& ProfilerManager::GetInstance() +{ + // Global reference to the single ProfileManager instance allowed. + static ProfilerManager s_ProfilerManager; + return s_ProfilerManager; +} + +void ProfilerManager::RegisterProfiler(Profiler* profiler) +{ + tl_Profiler = profiler; +} -#endif // ARMNN_PROFILING_ENABLED +Profiler* ProfilerManager::GetProfiler() +{ + return tl_Profiler; +} +} // namespace armnn diff --git a/src/armnn/Profiling.hpp b/src/armnn/Profiling.hpp index 88a7adff7c..33c5f46886 100644 --- a/src/armnn/Profiling.hpp +++ b/src/armnn/Profiling.hpp @@ -4,9 +4,12 @@ // #pragma once -#if ARMNN_PROFILING_ENABLED +#include "ProfilingEvent.hpp" #include "armnn/ArmNN.hpp" +#include "armnn/IProfiler.hpp" + +#include "WallClockTimer.hpp" #include #include @@ -15,82 +18,52 @@ #include #include +#include + namespace armnn { -// Clock class that uses the same timestamp function as the Mali DDK -class monotonic_clock { -public: - using duration = std::chrono::nanoseconds; - using time_point = std::chrono::time_point; - - static std::chrono::time_point now() noexcept - { - timespec ts; -#if defined(CLOCK_MONOTONIC_RAW) - clock_gettime(CLOCK_MONOTONIC_RAW, &ts); -#else - clock_gettime(CLOCK_MONOTONIC, &ts); -#endif - return time_point(std::chrono::nanoseconds(ts.tv_sec*1000000000 + ts.tv_nsec)); - } -}; - // Simple single-threaded profiler. // Tracks events reported by BeginEvent()/EndEvent() and outputs detailed information and stats when // Profiler::AnalyzeEventsAndWriteResults() is called. -class Profiler +class Profiler final : public IProfiler { public: + Profiler(); + ~Profiler(); + using InstrumentPtr = std::unique_ptr; + // Marks the beginning of a user-defined event. - // No attempt will be made to copy the name string: It must be known at compile time. - void BeginEvent(Compute compute, const std::string name); + // No attempt will be made to copy the name string: it must be known at compile time. + Event* BeginEvent(Compute compute, const std::string& name, std::vector&& instruments); // Marks the end of a user-defined event. - void EndEvent(Compute compute); + void EndEvent(Event* event); + + // Enables/disables profiling. + void EnableProfiling(bool enableProfiling) override; + + // Checks if profiling is enabled. + bool IsProfilingEnabled() override; // Increments the event tag, allowing grouping of events in a user-defined manner (e.g. per inference). - void UpdateEventTag() { ++m_EventTag; m_EventTagUpdated = true; } + void UpdateEventTag(); // Analyzes the tracked events and writes the results to the given output stream. // Please refer to the configuration variables in Profiling.cpp to customize the information written. - void AnalyzeEventsAndWriteResults(std::ostream& outStream) const; + void AnalyzeEventsAndWriteResults(std::ostream& outStream) const override; - // Accesses the singleton - static Profiler& Get() { return s_Instance; } + // Print stats for events in JSON Format to the given output stream. + void Print(std::ostream& outStream) const override; - // Gets a string name for a given Compute device enum - const char* GetEventComputeDevice(Compute compute) const; - - // Gets the color to render an event with, based on which device it denotes - std::uint32_t GetEventColor(Compute compute) const; - - typedef monotonic_clock Clock; - typedef std::chrono::time_point TimePoint; + // Gets the color to render an event with, based on which device it denotes. + uint32_t GetEventColor(Compute compute) const; private: - + using EventPtr = std::unique_ptr; struct Marker { std::size_t m_Id; - const std::string m_EventName; - TimePoint m_TimeStamp; - Compute m_ComputeDevice; - std::uint32_t m_Tag; - }; - - struct ProfilingEvent - { - std::string m_Label; - TimePoint m_StartTime; - TimePoint m_StopTime; - Compute m_Device; - std::uint32_t m_Tag; - - double DurationMs() const - { - return std::chrono::duration(m_StopTime - m_StartTime).count()*1000.0; - } }; struct ProfilingEventStats @@ -98,62 +71,100 @@ private: double m_TotalMs; double m_MinMs; double m_MaxMs; - std::uint32_t m_Count; + uint32_t m_Count; }; - Profiler(); - ~Profiler(); - // Waits for a compute device to finish working to guarantee correct timings. // Currently used exclusively when emitting profiling events denoting GPU work. void WaitForDevice(Compute compute) const; - void AnalyzeEventSequenceAndWriteResults(std::vector::const_iterator first, - std::vector::const_iterator last, - std::ostream& outStream) const; + template + void AnalyzeEventSequenceAndWriteResults(EventIterType first, EventIterType last, std::ostream& outStream) const; std::map CalculateProfilingEventStats() const; + void PopulateInferences(std::vector& outInferences, int& outBaseLevel) const; + void PopulateDescendants(std::map>& outDescendantsMap) const; - std::stack m_ObservedMarkers; - std::vector m_EventSequence; - std::uint32_t m_EventTag; - std::uint32_t m_NestingLevel; - bool m_EventTagUpdated; + std::stack m_Parents; + std::vector m_EventSequence; + bool m_ProfilingEnabled; - static Profiler s_Instance; +private: + // Friend functions for unit testing, see ProfilerTests.cpp. + friend size_t GetProfilerEventSequenceSize(armnn::Profiler* profiler); }; -// Helper to easily add event markers to the codebase +// Singleton profiler manager. +// Keeps track of all the running profiler instances. +class ProfilerManager +{ +public: + // Register the given profiler as a thread local pointer. + void RegisterProfiler(Profiler* profiler); + + // Gets the thread local pointer to the profiler. + Profiler* GetProfiler(); + + // Accesses the singleton. + static ProfilerManager& GetInstance(); + +private: + // The constructor is kept private so that other instances of this class (other that the singleton's) + // can't be allocated. + ProfilerManager() {} +}; + +// Helper to easily add event markers to the codebase. class ScopedProfilingEvent { public: - ScopedProfilingEvent(Compute compute, const std::string name) - : m_Compute(compute) + using InstrumentPtr = std::unique_ptr; + + template + ScopedProfilingEvent(Compute compute, const std::string& name, Args... args) + : m_Event(nullptr) + , m_Profiler(ProfilerManager::GetInstance().GetProfiler()) { - Profiler::Get().BeginEvent(compute, name); + if (m_Profiler && m_Profiler->IsProfilingEnabled()) + { + std::vector instruments(0); + instruments.reserve(sizeof...(args)); //One allocation + ConstructNextInVector(instruments, args...); + m_Event = m_Profiler->BeginEvent(compute, name, std::move(instruments)); + } } ~ScopedProfilingEvent() { - Profiler::Get().EndEvent(m_Compute); + if (m_Profiler && m_Event) + { + m_Profiler->EndEvent(m_Event); + } } private: - armnn::Compute m_Compute; -}; - -} // namespace armnn -// Allows grouping events in an user-defined manner (e.g. per inference) -#define ARMNN_UPDATE_PROFILING_EVENT_TAG() armnn::Profiler::Get().UpdateEventTag(); + void ConstructNextInVector(std::vector& instruments) + { + boost::ignore_unused(instruments); + } -// The event name must be known at compile time -#define ARMNN_SCOPED_PROFILING_EVENT(compute, name) armnn::ScopedProfilingEvent e_##__FILE__##__LINE__(compute, name); + template + void ConstructNextInVector(std::vector& instruments, Arg arg, Args... args) + { + instruments.emplace_back(std::make_unique(arg)); + ConstructNextInVector(instruments, args...); + } -#else + Event* m_Event; ///< Event to track + Profiler* m_Profiler; ///< Profiler used +}; -#define ARMNN_UPDATE_PROFILING_EVENT_TAG() -#define ARMNN_SCOPED_PROFILING_EVENT(compute, name) +} // namespace armnn -#endif // ARMNN_PROFILING_ENABLED +// The event name must be known at compile time +#define ARMNN_SCOPED_PROFILING_EVENT_WITH_INSTRUMENTS(compute, /*name,*/ ...) \ + armnn::ScopedProfilingEvent e_##__FILE__##__LINE__(compute, /*name,*/ __VA_ARGS__); +#define ARMNN_SCOPED_PROFILING_EVENT(compute, name) \ + ARMNN_SCOPED_PROFILING_EVENT_WITH_INSTRUMENTS(compute, name, armnn::WallClockTimer()) diff --git a/src/armnn/ProfilingEvent.cpp b/src/armnn/ProfilingEvent.cpp new file mode 100644 index 0000000000..42a44a7280 --- /dev/null +++ b/src/armnn/ProfilingEvent.cpp @@ -0,0 +1,103 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "Profiling.hpp" +#include "ProfilingEvent.hpp" + +namespace armnn +{ +Event::Event(const std::string& eventName, + Profiler* profiler, + Event* parent, + const Compute computeDevice, + std::vector&& instruments) + : m_EventName(eventName) + , m_Profiler(profiler) + , m_Parent(parent) + , m_ComputeDevice(computeDevice) + , m_Instruments(std::move(instruments)) +{ +} + +Event::Event(Event&& other) noexcept + : m_EventName(std::move(other.m_EventName)) + , m_Profiler(other.m_Profiler) + , m_Parent(other.m_Parent) + , m_ComputeDevice(other.m_ComputeDevice) + , m_Instruments(std::move(other.m_Instruments)) + +{ +} + +Event::~Event() noexcept +{ +} + +void Event::Start() +{ + for (auto& instrument : m_Instruments) + { + instrument->Start(); + } +} + +void Event::Stop() +{ + for (auto& instrument : m_Instruments) + { + instrument->Stop(); + } +} + +const std::vector Event::GetMeasurements() const +{ + std::vector measurements; + for (auto& instrument : m_Instruments) + { + for (auto& measurement : instrument->GetMeasurements()) + { + measurements.emplace_back(std::move(measurement)); + } + } + return measurements; +} + +const std::string& Event::GetName() const +{ + return m_EventName; +} + +const Profiler* Event::GetProfiler() const +{ + return m_Profiler; +} + +const Event* Event::GetParentEvent() const +{ + return m_Parent; +} + +Compute Event::GetComputeDevice() const +{ + return m_ComputeDevice; +} + +Event& Event::operator=(Event&& other) noexcept +{ + if (this == &other) + { + return *this; + } + + m_EventName = other.m_EventName; + m_Profiler = other.m_Profiler; + m_Parent = other.m_Parent; + m_ComputeDevice = other.m_ComputeDevice; + other.m_Profiler = nullptr; + other.m_Parent = nullptr; + return *this; +} + +} // namespace armnn diff --git a/src/armnn/ProfilingEvent.hpp b/src/armnn/ProfilingEvent.hpp new file mode 100644 index 0000000000..61a2ee99e3 --- /dev/null +++ b/src/armnn/ProfilingEvent.hpp @@ -0,0 +1,92 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include +#include +#include +#include +#include "Instrument.hpp" +#include "armnn/Types.hpp" + +namespace armnn +{ + +/// Forward declaration +class Profiler; + +/// Event class records measurements reported by BeginEvent()/EndEvent() and returns measurements when +/// Event::GetMeasurements() is called. +class Event +{ +public: + using InstrumentPtr = std::unique_ptr; + using Instruments = std::vector; + + Event(const std::string& eventName, + Profiler* profiler, + Event* parent, + const Compute computeDevice, + std::vector&& instrument); + + Event(const Event& other) = delete; + + /// Move Constructor + Event(Event&& other) noexcept; + + /// Destructor + ~Event() noexcept; + + /// Start the Event + void Start(); + + /// Stop the Event + void Stop(); + + /// Get the recorded measurements calculated between Start() and Stop() + /// \return Recorded measurements of the event + const std::vector GetMeasurements() const; + + /// Get the name of the event + /// \return Name of the event + const std::string& GetName() const; + + /// Get the pointer of the profiler associated with this event + /// \return Pointer of the profiler associated with this event + const Profiler* GetProfiler() const; + + /// Get the pointer of the parent event + /// \return Pointer of the parent event + const Event* GetParentEvent() const; + + /// Get the compute device of the event + /// \return Compute device of the event + Compute GetComputeDevice() const; + + /// Assignment operator + Event& operator=(const Event& other) = delete; + + /// Move Assignment operator + Event& operator=(Event&& other) noexcept; + +private: + /// Name of the event + std::string m_EventName; + + /// Stored associated profiler + Profiler* m_Profiler; + + /// Stores optional parent event + Event* m_Parent; + + /// Compute device + Compute m_ComputeDevice; + + /// Instruments to use + Instruments m_Instruments; +}; + +} // namespace armnn diff --git a/src/armnn/Runtime.cpp b/src/armnn/Runtime.cpp index 0ca3446e1b..7d1a9faaea 100644 --- a/src/armnn/Runtime.cpp +++ b/src/armnn/Runtime.cpp @@ -44,23 +44,33 @@ int Runtime::GenerateNetworkId() } Status Runtime::LoadNetwork(NetworkId& networkIdOut, IOptimizedNetworkPtr inNetwork) +{ + std::string ignoredErrorMessage; + return LoadNetwork(networkIdOut, std::move(inNetwork), ignoredErrorMessage); +} + +Status Runtime::LoadNetwork(NetworkId& networkIdOut, + IOptimizedNetworkPtr inNetwork, + std::string & errorMessage) { IOptimizedNetwork* rawNetwork = inNetwork.release(); unique_ptr loadedNetwork = LoadedNetwork::MakeLoadedNetwork( std::unique_ptr(boost::polymorphic_downcast(rawNetwork)), - m_UseCpuRefAsFallback); + errorMessage); if (!loadedNetwork) { return Status::Failure; } - std::lock_guard lockGuard(m_Mutex); - networkIdOut = GenerateNetworkId(); - // store the network - m_LoadedNetworks[networkIdOut] = std::move(loadedNetwork); + { + std::lock_guard lockGuard(m_Mutex); + + // Stores the network + m_LoadedNetworks[networkIdOut] = std::move(loadedNetwork); + } return Status::Success; } @@ -70,7 +80,7 @@ Status Runtime::UnloadNetwork(NetworkId networkId) #ifdef ARMCOMPUTECL_ENABLED if (arm_compute::CLScheduler::get().context()() != NULL) { - // wait for all queued CL requests to finish before unloading the network they may be using + // Waits for all queued CL requests to finish before unloading the network they may be using. try { // Coverity fix: arm_compute::CLScheduler::sync() may throw an exception of type cl::Error. @@ -84,36 +94,55 @@ Status Runtime::UnloadNetwork(NetworkId networkId) } } #endif - std::lock_guard lockGuard(m_Mutex); - if (m_LoadedNetworks.erase(networkId) == 0) { - BOOST_LOG_TRIVIAL(warning) << "WARNING: Runtime::UnloadNetwork(): " << networkId << " not found!"; - return Status::Failure; - } + std::lock_guard lockGuard(m_Mutex); + + if (m_LoadedNetworks.erase(networkId) == 0) + { + BOOST_LOG_TRIVIAL(warning) << "WARNING: Runtime::UnloadNetwork(): " << networkId << " not found!"; + return Status::Failure; + } + #ifdef ARMCOMPUTECL_ENABLED - if (arm_compute::CLScheduler::get().context()() != NULL && m_LoadedNetworks.empty()) - { - // There are no loaded networks left, so clear the CL cache to free up memory - m_ClContextControl.ClearClCache(); - } + if (arm_compute::CLScheduler::get().context()() != NULL && m_LoadedNetworks.empty()) + { + // There are no loaded networks left, so clear the CL cache to free up memory + m_ClContextControl.ClearClCache(); + } #endif + } + BOOST_LOG_TRIVIAL(debug) << "Runtime::UnloadNetwork(): Unloaded network with ID: " << networkId; return Status::Success; } +const std::shared_ptr Runtime::GetProfiler(NetworkId networkId) const +{ + auto it = m_LoadedNetworks.find(networkId); + if (it != m_LoadedNetworks.end()) + { + auto& loadedNetwork = it->second; + return loadedNetwork->GetProfiler(); + } + + return nullptr; +} + Runtime::Runtime(const CreationOptions& options) - : m_ClContextControl(options.m_ClTunedParameters) + : m_ClContextControl(options.m_GpuAccTunedParameters.get(), + options.m_EnableGpuProfiling) , m_NetworkIdCounter(0) { BOOST_LOG_TRIVIAL(info) << "ArmNN v" << ARMNN_VERSION << "\n"; - BOOST_LOG_TRIVIAL(info) << "Using compute device: " << options.m_DefaultComputeDevice << "\n"; - m_DeviceSpec.DefaultComputeDevice = options.m_DefaultComputeDevice; - // If useCpuRefAsFallback is false, the reference workload factory will be prevented from creating - // operation workloads, unless the default compute device is precisely the reference backend. - // This option is passed to the LoadedNetwork, which owns the workload factories. - m_UseCpuRefAsFallback = options.m_DefaultComputeDevice == Compute::CpuRef || options.m_UseCpuRefAsFallback; + m_DeviceSpec.m_SupportedComputeDevices.insert(armnn::Compute::CpuRef); + #if ARMCOMPUTECL_ENABLED + m_DeviceSpec.m_SupportedComputeDevices.insert(armnn::Compute::GpuAcc); + #endif + #if ARMCOMPUTENEON_ENABLED + m_DeviceSpec.m_SupportedComputeDevices.insert(armnn::Compute::CpuAcc); + #endif } Runtime::~Runtime() @@ -173,8 +202,8 @@ TensorInfo Runtime::GetOutputTensorInfo(NetworkId networkId, LayerBindingId laye } Status Runtime::EnqueueWorkload(NetworkId networkId, - const InputTensors& inputTensors, - const OutputTensors& outputTensors) + const InputTensors& inputTensors, + const OutputTensors& outputTensors) { LoadedNetwork* loadedNetwork = GetLoadedNetworkPtr(networkId); return loadedNetwork->EnqueueWorkload(inputTensors, outputTensors); diff --git a/src/armnn/Runtime.hpp b/src/armnn/Runtime.hpp index 3879e1dd52..151dde3588 100644 --- a/src/armnn/Runtime.hpp +++ b/src/armnn/Runtime.hpp @@ -5,6 +5,7 @@ #pragma once #include "LoadedNetwork.hpp" +#include "DeviceSpec.hpp" #include "armnn/INetwork.hpp" #include "armnn/IRuntime.hpp" #include "armnn/Tensor.hpp" @@ -19,29 +20,44 @@ namespace armnn class Runtime final : public IRuntime { public: - /// Load a complete network into the Runtime. - /// @param [out] networkIdOut Unique identifier for the network is returned in this reference. - /// @param [in] network Complete network to load into the Runtime. + /// Loads a complete network into the Runtime. + /// @param [out] networkIdOut - Unique identifier for the network is returned in this reference. + /// @param [in] network - Complete network to load into the Runtime. /// The runtime takes ownership of the network once passed in. /// @return armnn::Status virtual Status LoadNetwork(NetworkId& networkIdOut, IOptimizedNetworkPtr network) override; + /// Load a complete network into the IRuntime. + /// @param [out] networkIdOut Unique identifier for the network is returned in this reference. + /// @param [in] network Complete network to load into the IRuntime. + /// @param [out] errorMessage Error message if there were any errors. + /// The runtime takes ownership of the network once passed in. + /// @return armnn::Status + virtual Status LoadNetwork(NetworkId& networkIdOut, + IOptimizedNetworkPtr network, + std::string & errorMessage) override; + virtual TensorInfo GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const override; virtual TensorInfo GetOutputTensorInfo(NetworkId networkId, LayerBindingId layerId) const override; - // Evaluate network using input in inputTensors, outputs filled into outputTensors + // Evaluates network using input in inputTensors, outputs filled into outputTensors. virtual Status EnqueueWorkload(NetworkId networkId, const InputTensors& inputTensors, const OutputTensors& outputTensors) override; - /// Unload a network from the Runtime. + /// Unloads a network from the Runtime. /// At the moment this only removes the network from the m_Impl->m_Network. /// This might need more work in the future to be AndroidNN compliant. /// @param [in] networkId Unique identifier for the network to be unloaded. Generated in LoadNetwork(). /// @return armnn::Status virtual Status UnloadNetwork(NetworkId networkId) override; - virtual const DeviceSpec& GetDeviceSpec() const override { return m_DeviceSpec; } + virtual const IDeviceSpec& GetDeviceSpec() const override { return m_DeviceSpec; } + + /// Gets the profiler corresponding to the given network id. + /// @param networkId The id of the network for which to get the profile. + /// @return A pointer to the requested profiler, or nullptr if not found. + virtual const std::shared_ptr GetProfiler(NetworkId networkId) const override; /// Creates a runtime for workload execution. /// May throw a ClRuntimeUnavailableException if @a defaultComputeDevice requires a CL runtime but @@ -51,7 +67,7 @@ public: ~Runtime(); private: - friend void RuntimeLoadedNetworksReserve(armnn::Runtime* runtime); // see RuntimeTests.cpp + friend void RuntimeLoadedNetworksReserve(armnn::Runtime* runtime); // See RuntimeTests.cpp int GenerateNetworkId(); @@ -65,8 +81,6 @@ private: int m_NetworkIdCounter; - bool m_UseCpuRefAsFallback; - DeviceSpec m_DeviceSpec; }; diff --git a/src/armnn/Tensor.cpp b/src/armnn/Tensor.cpp index 2e04c8c617..e5d7f4b1b8 100644 --- a/src/armnn/Tensor.cpp +++ b/src/armnn/Tensor.cpp @@ -180,7 +180,7 @@ BaseTensor& BaseTensor::operator =(const BaseTensor; template class BaseTensor; diff --git a/src/armnn/TypeUtils.hpp b/src/armnn/TypeUtils.hpp new file mode 100644 index 0000000000..2b70e28ff3 --- /dev/null +++ b/src/armnn/TypeUtils.hpp @@ -0,0 +1,40 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "armnn/Types.hpp" +#include "Half.hpp" + +namespace armnn +{ + + +template +struct ResolveTypeImpl; + +template<> +struct ResolveTypeImpl +{ + using Type = uint8_t; +}; + +template <> +struct ResolveTypeImpl +{ + using Type = Half; +}; + +template<> +struct ResolveTypeImpl +{ + using Type = float; +}; + +template +using ResolveType = typename ResolveTypeImpl
::Type; + + +} //namespace armnn \ No newline at end of file diff --git a/src/armnn/Utils.cpp b/src/armnn/Utils.cpp index fbde701a2a..5dafe54d7a 100644 --- a/src/armnn/Utils.cpp +++ b/src/armnn/Utils.cpp @@ -15,7 +15,7 @@ void ConfigureLogging(bool printToStandardOutput, bool printToDebugOutput, LogSe ConfigureLogging(boost::log::core::get().get(), printToStandardOutput, printToDebugOutput, severity); } -// Default to logging completely disabled. +// Defaults to logging completely disabled. // The user of the library must enable it if they want by calling armnn::ConfigureLogging(). struct DefaultLoggingConfiguration { @@ -27,4 +27,4 @@ struct DefaultLoggingConfiguration static DefaultLoggingConfiguration g_DefaultLoggingConfiguration; -} \ No newline at end of file +} // namespace armnn \ No newline at end of file diff --git a/src/armnn/WallClockTimer.cpp b/src/armnn/WallClockTimer.cpp new file mode 100644 index 0000000000..93d12222f7 --- /dev/null +++ b/src/armnn/WallClockTimer.cpp @@ -0,0 +1,41 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "WallClockTimer.hpp" + +namespace armnn +{ + +const std::string WallClockTimer::WALL_CLOCK_TIME ("Wall clock time"); +const std::string WallClockTimer::WALL_CLOCK_TIME_START(WallClockTimer::WALL_CLOCK_TIME + " (Start)"); +const std::string WallClockTimer::WALL_CLOCK_TIME_STOP (WallClockTimer::WALL_CLOCK_TIME + " (Stop)"); + +const char* WallClockTimer::GetName() const +{ + return "WallClockTimer"; +} + +void WallClockTimer::Start() +{ + m_Start = clock::now(); +} + +void WallClockTimer::Stop() +{ + m_Stop = clock::now(); +} + +std::vector WallClockTimer::GetMeasurements() const +{ + const auto delta = std::chrono::duration(m_Stop - m_Start); + const auto startTimeMs = std::chrono::duration(m_Start.time_since_epoch()); + const auto stopTimeMs = std::chrono::duration(m_Stop.time_since_epoch()); + + return { { WALL_CLOCK_TIME, delta.count(), Measurement::Unit::TIME_MS }, + { WALL_CLOCK_TIME_START, startTimeMs.count(), Measurement::Unit::TIME_MS }, + { WALL_CLOCK_TIME_STOP, stopTimeMs.count(), Measurement::Unit::TIME_MS } }; +} + +} //namespace armnn diff --git a/src/armnn/WallClockTimer.hpp b/src/armnn/WallClockTimer.hpp new file mode 100644 index 0000000000..84b46da8a2 --- /dev/null +++ b/src/armnn/WallClockTimer.hpp @@ -0,0 +1,63 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "Instrument.hpp" +#include + +namespace armnn +{ + +// Clock class that uses the same timestamp function as the Mali DDK. +class monotonic_clock_raw { +public: + using duration = std::chrono::nanoseconds; + using time_point = std::chrono::time_point; + + static std::chrono::time_point now() noexcept + { + timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return time_point(std::chrono::nanoseconds(ts.tv_sec*1000000000 + ts.tv_nsec)); + } +}; + +// Implementation of an instrument to measure elapsed wall-clock time in milliseconds. +class WallClockTimer : public Instrument +{ +public: + // Construct a Wall Clock Timer + WallClockTimer() = default; + ~WallClockTimer() = default; + + // Start the Wall clock timer + void Start() override; + + // Stop the Wall clock timer + void Stop() override; + + // Get the name of the timer + const char* GetName() const override; + + // Get the recorded measurements + std::vector GetMeasurements() const override; + +#if defined(CLOCK_MONOTONIC_RAW) + using clock = monotonic_clock_raw; +#else + using clock = std::chrono::steady_clock; +#endif + + static const std::string WALL_CLOCK_TIME; + static const std::string WALL_CLOCK_TIME_START; + static const std::string WALL_CLOCK_TIME_STOP; + +private: + clock::time_point m_Start; + clock::time_point m_Stop; +}; + +} //namespace armnn diff --git a/src/armnn/backends/AclBaseMemoryManager.cpp b/src/armnn/backends/AclBaseMemoryManager.cpp deleted file mode 100644 index fc796995c7..0000000000 --- a/src/armnn/backends/AclBaseMemoryManager.cpp +++ /dev/null @@ -1,32 +0,0 @@ -// -// Copyright © 2017 Arm Ltd. All rights reserved. -// See LICENSE file in the project root for full license information. -// -#include "AclBaseMemoryManager.hpp" - -namespace armnn -{ - -#if ARMCOMPUTENEON_ENABLED || ARMCOMPUTECL_ENABLED -AclBaseMemoryManager::AclBaseMemoryManager(std::unique_ptr alloc) -{ - // (re)create the memory manager components - m_Allocator = std::move(alloc); - m_IntraLayerLifetimeMgr = std::make_shared(); - m_IntraLayerPoolMgr = std::make_shared(); - m_IntraLayerMemoryMgr = std::make_shared(m_IntraLayerLifetimeMgr, - m_IntraLayerPoolMgr); -} - -void AclBaseMemoryManager::Finalize() -{ - // Set allocator that the memory manager will use - m_IntraLayerMemoryMgr->set_allocator(m_Allocator.get()); - // Number of pools that the manager will create. This specifies how many layers you want to run in parallel - m_IntraLayerMemoryMgr->set_num_pools(1); - // Finalize the memory manager. (Validity checks, memory allocations, etc) - m_IntraLayerMemoryMgr->finalize(); -} -#endif - -} diff --git a/src/armnn/backends/AclBaseMemoryManager.hpp b/src/armnn/backends/AclBaseMemoryManager.hpp deleted file mode 100644 index 74b596fe97..0000000000 --- a/src/armnn/backends/AclBaseMemoryManager.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// -// Copyright © 2017 Arm Ltd. All rights reserved. -// See LICENSE file in the project root for full license information. -// -#pragma once - -#include "WorkloadFactory.hpp" - -#if ARMCOMPUTENEON_ENABLED || ARMCOMPUTECL_ENABLED -#include "arm_compute/runtime/IAllocator.h" -#include "arm_compute/runtime/BlobLifetimeManager.h" -#include "arm_compute/runtime/MemoryManagerOnDemand.h" -#include "arm_compute/runtime/PoolManager.h" - -#include -#endif - -namespace armnn -{ - -// ARM Compute Base Memory Manager -class AclBaseMemoryManager -{ -public: - - AclBaseMemoryManager() { } - virtual ~AclBaseMemoryManager() { } - -#if ARMCOMPUTENEON_ENABLED || ARMCOMPUTECL_ENABLED - AclBaseMemoryManager(std::unique_ptr alloc); - - void Finalize(); - - std::shared_ptr& Get() { return m_IntraLayerMemoryMgr; } - -protected: - - mutable std::unique_ptr m_Allocator; - mutable std::shared_ptr m_IntraLayerLifetimeMgr; - mutable std::shared_ptr m_IntraLayerPoolMgr; - mutable std::shared_ptr m_IntraLayerMemoryMgr; -#endif - -}; - -} //namespace armnn diff --git a/src/armnn/backends/ArmComputeTensorUtils.cpp b/src/armnn/backends/ArmComputeTensorUtils.cpp index f88ed2b4c3..8e4abaf67a 100644 --- a/src/armnn/backends/ArmComputeTensorUtils.cpp +++ b/src/armnn/backends/ArmComputeTensorUtils.cpp @@ -16,23 +16,17 @@ arm_compute::DataType GetArmComputeDataType(armnn::DataType dataType) { switch(dataType) { + case armnn::DataType::Float16: + return arm_compute::DataType::F16; case armnn::DataType::Float32: - { return arm_compute::DataType::F32; - } case armnn::DataType::QuantisedAsymm8: - { return arm_compute::DataType::QASYMM8; - } case armnn::DataType::Signed32: - { return arm_compute::DataType::S32; - } default: - { BOOST_ASSERT_MSG(false, "Unknown data type"); return arm_compute::DataType::UNKNOWN; - } } } @@ -40,15 +34,15 @@ arm_compute::TensorShape BuildArmComputeTensorShape(const armnn::TensorShape& te { arm_compute::TensorShape shape; - // armnn tensors are (batch, channels, height, width) - // arm_compute tensors are (width, height, channels, batch) + // armnn tensors are (batch, channels, height, width). + // arm_compute tensors are (width, height, channels, batch). for (unsigned int i = 0; i < tensorShape.GetNumDimensions(); i++) { - // note that our dimensions are stored in the opposite order to ACL's + // Note that our dimensions are stored in the opposite order to ACL's. shape.set(tensorShape.GetNumDimensions() - i - 1, tensorShape[i]); // TensorShape::set() flattens leading ones, so that batch size 1 cannot happen. - // arm_compute tensors expect this + // arm_compute tensors expect this. } // prevent arm_compute issue where tensor is flattened to nothing @@ -80,11 +74,18 @@ arm_compute::PoolingLayerInfo BuildArmComputePoolingLayerInfo(const Pooling2dDes using arm_compute::PoolingLayerInfo; using arm_compute::Size2D; - // Resolve ARM Compute layer parameters + // Resolve ARM Compute layer parameters. const PoolingType poolingType = ConvertPoolingAlgorithmToAclPoolingType(descriptor.m_PoolType); + + bool isGlobalPooling = (descriptor.m_StrideX==0 && descriptor.m_StrideY==0); + //use specific constructor if global pooling + if(isGlobalPooling) + { + return arm_compute::PoolingLayerInfo(poolingType); + } + const DimensionRoundingType rounding = ConvertOutputShapeRoundingToAclDimensionRoundingType( descriptor.m_OutputShapeRounding); - const PadStrideInfo padStrideInfo(descriptor.m_StrideX, descriptor.m_StrideY, descriptor.m_PadLeft, diff --git a/src/armnn/backends/ArmComputeTensorUtils.hpp b/src/armnn/backends/ArmComputeTensorUtils.hpp index 84547f9c80..81c6620a01 100644 --- a/src/armnn/backends/ArmComputeTensorUtils.hpp +++ b/src/armnn/backends/ArmComputeTensorUtils.hpp @@ -20,26 +20,26 @@ class ITensorHandle; namespace armcomputetensorutils { -/// Utility function to map an armnn::DataType to corresponding arm_compute::DataType +/// Utility function to map an armnn::DataType to corresponding arm_compute::DataType. arm_compute::DataType GetArmComputeDataType(armnn::DataType dataType); -/// Utility function used to setup an arm_compute::TensorShape object from an armnn::TensorShape +/// Utility function used to setup an arm_compute::TensorShape object from an armnn::TensorShape. arm_compute::TensorShape BuildArmComputeTensorShape(const armnn::TensorShape& tensorShape); /// Utility function used to setup an arm_compute::ITensorInfo object whose dimensions are based on the given -/// armnn::ITensorInfo +/// armnn::ITensorInfo. arm_compute::TensorInfo BuildArmComputeTensorInfo(const armnn::TensorInfo& tensorInfo); -/// Utility function used to setup an arm_compute::PoolingLayerInfo object from an armnn::Pooling2dDescriptor +/// Utility function used to setup an arm_compute::PoolingLayerInfo object from an armnn::Pooling2dDescriptor. arm_compute::PoolingLayerInfo BuildArmComputePoolingLayerInfo(const Pooling2dDescriptor& descriptor); -/// Utility function to setup an arm_compute::NormalizationLayerInfo object from an armnn::NormalizationDescriptor +/// Utility function to setup an arm_compute::NormalizationLayerInfo object from an armnn::NormalizationDescriptor. arm_compute::NormalizationLayerInfo BuildArmComputeNormalizationLayerInfo(const NormalizationDescriptor& desc); -/// Utility function used to setup an arm_compute::PermutationVector object from an armnn::PermutationVector +/// Utility function used to setup an arm_compute::PermutationVector object from an armnn::PermutationVector. arm_compute::PermutationVector BuildArmComputePermutationVector(const armnn::PermutationVector& vector); -/// Utility function used to setup an arm_compute::PadStrideInfo object from an armnn layer descriptor +/// Utility function used to setup an arm_compute::PadStrideInfo object from an armnn layer descriptor. template arm_compute::PadStrideInfo BuildArmComputePadStrideInfo(const Descriptor &descriptor) { @@ -65,6 +65,16 @@ void InitialiseArmComputeTensorEmpty(Tensor& tensor) tensor.allocator()->allocate(); } +/// Utility function to free unused tensors after a workload is configured and prepared +template +void FreeTensorIfUnused(std::unique_ptr& tensor) +{ + if (tensor && !tensor->is_used()) + { + tensor.reset(nullptr); + } +} + // Helper function to obtain byte offset into tensor data inline size_t GetTensorOffset(const arm_compute::ITensorInfo& info, uint32_t batchIndex, @@ -73,14 +83,14 @@ inline size_t GetTensorOffset(const arm_compute::ITensorInfo& info, uint32_t x) { arm_compute::Coordinates coords; - coords.set(3, boost::numeric_cast(batchIndex)); - coords.set(2, boost::numeric_cast(channelIndex)); - coords.set(1, boost::numeric_cast(y)); - coords.set(0, boost::numeric_cast(x)); + coords.set(3, static_cast(batchIndex)); + coords.set(2, static_cast(channelIndex)); + coords.set(1, static_cast(y)); + coords.set(0, static_cast(x)); return info.offset_element_in_bytes(coords); } -// Helper function to obtain element offset into data buffer representing tensor data (assuming no strides) +// Helper function to obtain element offset into data buffer representing tensor data (assuming no strides). inline size_t GetLinearBufferOffset(const arm_compute::ITensorInfo& info, uint32_t batchIndex, uint32_t channelIndex, @@ -88,25 +98,25 @@ inline size_t GetLinearBufferOffset(const arm_compute::ITensorInfo& info, uint32_t x) { const arm_compute::TensorShape& shape = info.tensor_shape(); - uint32_t width = boost::numeric_cast(shape[0]); - uint32_t height = boost::numeric_cast(shape[1]); - uint32_t numChannels = boost::numeric_cast(shape[2]); + uint32_t width = static_cast(shape[0]); + uint32_t height = static_cast(shape[1]); + uint32_t numChannels = static_cast(shape[2]); return ((batchIndex * numChannels + channelIndex) * height + y) * width + x; } template void CopyArmComputeITensorData(const arm_compute::ITensor& srcTensor, T* dstData) { - // if MaxNumOfTensorDimensions is increased, this loop will need fixing + // If MaxNumOfTensorDimensions is increased, this loop will need fixing. static_assert(MaxNumOfTensorDimensions == 4, "Please update CopyArmComputeITensorData"); { const arm_compute::ITensorInfo& info = *srcTensor.info(); const arm_compute::TensorShape& shape = info.tensor_shape(); const uint8_t* const bufferPtr = srcTensor.buffer(); - uint32_t width = boost::numeric_cast(shape[0]); - uint32_t height = boost::numeric_cast(shape[1]); - uint32_t numChannels = boost::numeric_cast(shape[2]); - uint32_t numBatches = boost::numeric_cast(shape[3]); + uint32_t width = static_cast(shape[0]); + uint32_t height = static_cast(shape[1]); + uint32_t numChannels = static_cast(shape[2]); + uint32_t numBatches = static_cast(shape[3]); for (unsigned int batchIndex = 0; batchIndex < numBatches; ++batchIndex) { @@ -114,8 +124,8 @@ void CopyArmComputeITensorData(const arm_compute::ITensor& srcTensor, T* dstData { for (unsigned int y = 0; y < height; ++y) { - // Copy one row from arm_compute tensor buffer to linear memory buffer - // A row is the largest contiguous region we can copy, as the tensor data may be using strides + // Copies one row from arm_compute tensor buffer to linear memory buffer. + // A row is the largest contiguous region we can copy, as the tensor data may be using strides. memcpy(dstData + GetLinearBufferOffset(info, batchIndex, channelIndex, y, 0), bufferPtr + GetTensorOffset(info, batchIndex, channelIndex, y, 0), width * sizeof(T)); @@ -128,16 +138,16 @@ void CopyArmComputeITensorData(const arm_compute::ITensor& srcTensor, T* dstData template void CopyArmComputeITensorData(const T* srcData, arm_compute::ITensor& dstTensor) { - // if MaxNumOfTensorDimensions is increased, this loop will need fixing + // If MaxNumOfTensorDimensions is increased, this loop will need fixing. static_assert(MaxNumOfTensorDimensions == 4, "Please update CopyArmComputeITensorData"); { const arm_compute::ITensorInfo& info = *dstTensor.info(); const arm_compute::TensorShape& shape = info.tensor_shape(); uint8_t* const bufferPtr = dstTensor.buffer(); - uint32_t width = boost::numeric_cast(shape[0]); - uint32_t height = boost::numeric_cast(shape[1]); - uint32_t numChannels = boost::numeric_cast(shape[2]); - uint32_t numBatches = boost::numeric_cast(shape[3]); + uint32_t width = static_cast(shape[0]); + uint32_t height = static_cast(shape[1]); + uint32_t numChannels = static_cast(shape[2]); + uint32_t numBatches = static_cast(shape[3]); for (unsigned int batchIndex = 0; batchIndex < numBatches; ++batchIndex) { @@ -145,8 +155,8 @@ void CopyArmComputeITensorData(const T* srcData, arm_compute::ITensor& dstTensor { for (unsigned int y = 0; y < height; ++y) { - // Copy one row from linear memory buffer to arm_compute tensor buffer - // A row is the largest contiguous region we can copy, as the tensor data may be using strides + // Copies one row from linear memory buffer to arm_compute tensor buffer. + // A row is the largest contiguous region we can copy, as the tensor data may be using strides. memcpy(bufferPtr + GetTensorOffset(info, batchIndex, channelIndex, y, 0), srcData + GetLinearBufferOffset(info, batchIndex, channelIndex, y, 0), width * sizeof(T)); @@ -156,5 +166,34 @@ void CopyArmComputeITensorData(const T* srcData, arm_compute::ITensor& dstTensor } } +/// Construct a TensorShape object from an ArmCompute object based on arm_compute::Dimensions. +/// \tparam ArmComputeType Any type that implements the Dimensions interface +/// \tparam T Shape value type +/// \param shapelike An ArmCompute object that implements the Dimensions interface +/// \param initial A default value to initialise the shape with +/// \return A TensorShape object filled from the Acl shapelike object. +template +TensorShape GetTensorShape(const ArmComputeType& shapelike, T initial) +{ + std::vector s(MaxNumOfTensorDimensions, initial); + for (unsigned int i=0; i < shapelike.num_dimensions(); ++i) + { + s[(shapelike.num_dimensions()-1)-i] = boost::numeric_cast(shapelike[i]); + } + return TensorShape(boost::numeric_cast(shapelike.num_dimensions()), s.data()); +}; + +/// Get the strides from an ACL strides object +inline TensorShape GetStrides(const arm_compute::Strides& strides) +{ + return GetTensorShape(strides, 0U); +} + +/// Get the shape from an ACL shape object +inline TensorShape GetShape(const arm_compute::TensorShape& shape) +{ + return GetTensorShape(shape, 1U); +} + } // namespace armcomputetensorutils } // namespace armnn diff --git a/src/armnn/backends/ArmComputeUtils.hpp b/src/armnn/backends/ArmComputeUtils.hpp index c451e6434b..3c57fb59b7 100644 --- a/src/armnn/backends/ArmComputeUtils.hpp +++ b/src/armnn/backends/ArmComputeUtils.hpp @@ -36,7 +36,7 @@ CreateAclNormalizationLayerInfoForL2Normalization(const armnn::TensorInfo& tenso // For the reference implementation, to make alpha_ become 1, we'd have to use alpha = normSize instead. const float alpha = 1.0f; - // Don't offset the reduction + // Don't offset the reduction. const float kappa = 0.0f; // pow(reduction, -0.5) = 1 / sqrt(reduction) @@ -53,7 +53,7 @@ ConvertActivationFunctionToAclActivationFunction(ActivationFunction armnnFunctio switch (armnnFunction) { case ActivationFunction::Linear: return AclActivationFunction::LINEAR; - // Arm compute's 'logistic' function is non-parameterized, so it is exactly a sigmoid function + // Arm compute's 'logistic' function is non-parameterized, so it is exactly a sigmoid function. case ActivationFunction::Sigmoid: return AclActivationFunction::LOGISTIC; case ActivationFunction::ReLu: return AclActivationFunction::RELU; case ActivationFunction::BoundedReLu: return AclActivationFunction::LU_BOUNDED_RELU; @@ -112,6 +112,14 @@ ConvertNormalizationAlgorithmChannelToAclNormType(NormalizationAlgorithmChannel } } +inline arm_compute::FullyConnectedLayerInfo +ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(const FullyConnectedDescriptor& fullyConnectedDesc) +{ + arm_compute::FullyConnectedLayerInfo fc_info; + fc_info.transpose_weights = fullyConnectedDesc.m_TransposeWeightMatrix; + return fc_info; +} + } #endif // ARMCOMPUTENEON_ENABLED || ARMCOMPUTECL_ENABLED diff --git a/src/armnn/backends/ClContextControl.cpp b/src/armnn/backends/ClContextControl.cpp index f086328e55..68e878da79 100644 --- a/src/armnn/backends/ClContextControl.cpp +++ b/src/armnn/backends/ClContextControl.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #include "LeakChecking.hpp" @@ -29,22 +30,27 @@ class Device; namespace armnn { -ClContextControl::ClContextControl(IClTunedParameters* clTunedParameters) +ClContextControl::ClContextControl(IGpuAccTunedParameters* clTunedParameters, + bool profilingEnabled) : m_clTunedParameters(boost::polymorphic_downcast(clTunedParameters)) + , m_ProfilingEnabled(profilingEnabled) { + // Ignore m_ProfilingEnabled if unused to avoid compiling problems when ArmCompute is disabled. + boost::ignore_unused(m_ProfilingEnabled); + #ifdef ARMCOMPUTECL_ENABLED try { std::vector platforms; cl::Platform::get(&platforms); - // Select default platform as the first element + // Selects default platform for the first element. cl::Platform::setDefault(platforms[0]); std::vector devices; platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices); - // Select default device as the first element + // Selects default device for the first element. cl::Device::setDefault(devices[0]); } catch (const cl::Error& clError) @@ -54,15 +60,15 @@ ClContextControl::ClContextControl(IClTunedParameters* clTunedParameters) ) % clError.what() % clError.err())); } - // Remove the use of global CL context + // Removes the use of global CL context. cl::Context::setDefault(cl::Context{}); BOOST_ASSERT(cl::Context::getDefault()() == NULL); - // Remove the use of global CL command queue + // Removes the use of global CL command queue. cl::CommandQueue::setDefault(cl::CommandQueue{}); BOOST_ASSERT(cl::CommandQueue::getDefault()() == NULL); - // always load the OpenCL runtime + // Always load the OpenCL runtime. LoadOpenClRuntime(); #endif } @@ -70,14 +76,14 @@ ClContextControl::ClContextControl(IClTunedParameters* clTunedParameters) ClContextControl::~ClContextControl() { #ifdef ARMCOMPUTECL_ENABLED - // load the OpencCL runtime without the tuned parameters to free the memory for them + // Load the OpencCL runtime without the tuned parameters to free the memory for them. try { UnloadOpenClRuntime(); } catch (const cl::Error& clError) { - // this should not happen, it is ignored if it does + // This should not happen, it is ignored if it does. // Coverity fix: BOOST_LOG_TRIVIAL (previously used here to report the error) may throw an // exception of type std::length_error. @@ -107,23 +113,23 @@ void ClContextControl::DoLoadOpenClRuntime(bool useTunedParameters) if (arm_compute::CLScheduler::get().context()() != NULL) { - // wait for all queued CL requests to finish before reinitialising it + // Wait for all queued CL requests to finish before reinitialising it. arm_compute::CLScheduler::get().sync(); } try { arm_compute::CLKernelLibrary::get().clear_programs_cache(); - // initialise the scheduler with a dummy context to release the LLVM data (which only happens when there are no + // Initialise the scheduler with a dummy context to release the LLVM data (which only happens when there are no // context references); it is initialised again, with a proper context, later. arm_compute::CLScheduler::get().init(context, commandQueue, device); arm_compute::CLKernelLibrary::get().init(".", context, device); { // - // Here we replace the context with a new one which in - // the memory leak checks shows as an extra allocation but - // because of the scope of the leak check it doesn't count + // Here we replace the context with a new one in which + // the memory leak checks show it as an extra allocation but + // because of the scope of the leak checks, it doesn't count // the disposal of the original object. On the other hand it // does count the creation of this context which it flags // as a memory leak. By adding the following line we prevent @@ -133,24 +139,19 @@ void ClContextControl::DoLoadOpenClRuntime(bool useTunedParameters) context = cl::Context(device); } - bool enableProfiling = false; -#if ARMNN_PROFILING_ENABLED - enableProfiling = true; -#endif - if (useTunedParameters && - m_clTunedParameters && m_clTunedParameters->m_Mode == IClTunedParameters::Mode::UpdateTunedParameters) - { - enableProfiling = true; // Needed for the CLTuner to work. - } + // NOTE: In this specific case profiling has to be enabled on the command queue + // in order for the CLTuner to work. + bool profilingNeededForClTuner = useTunedParameters && m_clTunedParameters && + m_clTunedParameters->m_Mode == IGpuAccTunedParameters::Mode::UpdateTunedParameters; - if (enableProfiling) + if (m_ProfilingEnabled || profilingNeededForClTuner) { - // Create a new queue with profiling enabled + // Create a new queue with profiling enabled. commandQueue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE); } else { - // Use default queue + // Use default queue. commandQueue = cl::CommandQueue(context, device); } } @@ -178,22 +179,22 @@ void ClContextControl::ClearClCache() DoLoadOpenClRuntime(true); } -armnn::IClTunedParameters* IClTunedParameters::CreateRaw(armnn::IClTunedParameters::Mode mode) +armnn::IGpuAccTunedParameters* IGpuAccTunedParameters::CreateRaw(armnn::IGpuAccTunedParameters::Mode mode) { return new ClTunedParameters(mode); } -armnn::IClTunedParametersPtr IClTunedParameters::Create(armnn::IClTunedParameters::Mode mode) +armnn::IGpuAccTunedParametersPtr IGpuAccTunedParameters::Create(armnn::IGpuAccTunedParameters::Mode mode) { - return IClTunedParametersPtr(CreateRaw(mode), &IClTunedParameters::Destroy); + return IGpuAccTunedParametersPtr(CreateRaw(mode), &IGpuAccTunedParameters::Destroy); } -void IClTunedParameters::Destroy(IClTunedParameters* params) +void IGpuAccTunedParameters::Destroy(IGpuAccTunedParameters* params) { delete params; } -ClTunedParameters::ClTunedParameters(armnn::IClTunedParameters::Mode mode) +ClTunedParameters::ClTunedParameters(armnn::IGpuAccTunedParameters::Mode mode) : m_Mode(mode) #ifdef ARMCOMPUTECL_ENABLED , m_Tuner(mode == ClTunedParameters::Mode::UpdateTunedParameters) diff --git a/src/armnn/backends/ClContextControl.hpp b/src/armnn/backends/ClContextControl.hpp index 8098e30b75..ee1b797055 100644 --- a/src/armnn/backends/ClContextControl.hpp +++ b/src/armnn/backends/ClContextControl.hpp @@ -13,15 +13,16 @@ namespace armnn { -class IClTunedParameters; +class IGpuAccTunedParameters; class ClTunedParameters; -// ARM Compute OpenCL context control +// ARM Compute OpenCL context control. class ClContextControl { public: - ClContextControl(IClTunedParameters* clTunedParameters = nullptr); + ClContextControl(IGpuAccTunedParameters* clTunedParameters = nullptr, + bool profilingEnabled = false); virtual ~ClContextControl(); @@ -31,7 +32,7 @@ public: // to release the cached memory used by the compute library. void UnloadOpenClRuntime(); - // Clear the CL cache, without losing the tuned parameter settings + // Clear the CL cache, without losing the tuned parameter settings. void ClearClCache(); private: @@ -40,12 +41,13 @@ private: ClTunedParameters* m_clTunedParameters; + bool m_ProfilingEnabled; }; -class ClTunedParameters : public IClTunedParameters +class ClTunedParameters : public IGpuAccTunedParameters { public: - ClTunedParameters(armnn::IClTunedParameters::Mode mode); + ClTunedParameters(armnn::IGpuAccTunedParameters::Mode mode); virtual void Load(const char* filename); virtual void Save(const char* filename) const; diff --git a/src/armnn/backends/ClLayerSupport.cpp b/src/armnn/backends/ClLayerSupport.cpp index 8905adf1fc..72594ac82b 100644 --- a/src/armnn/backends/ClLayerSupport.cpp +++ b/src/armnn/backends/ClLayerSupport.cpp @@ -7,7 +7,6 @@ #include "ClLayerSupport.hpp" #include "InternalTypes.hpp" - #include #include #include @@ -16,10 +15,21 @@ #ifdef ARMCOMPUTECL_ENABLED #include "ClWorkloads/ClAdditionFloat32Workload.hpp" +#include "ClWorkloads/ClActivationFloat32Workload.hpp" +#include "ClWorkloads/ClBatchNormalizationFloat32Workload.hpp" + +#include "ClWorkloads/ClConvertFp16ToFp32Workload.hpp" +#include "ClWorkloads/ClConvertFp32ToFp16Workload.hpp" #include "ClWorkloads/ClConvolution2dBaseWorkload.hpp" +#include "ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp" +#include "ClWorkloads/ClL2NormalizationFloat32Workload.hpp" +#include "ClWorkloads/ClMultiplicationFloat32Workload.hpp" +#include "ClWorkloads/ClFullyConnectedFloat32Workload.hpp" #include "ClWorkloads/ClPooling2dBaseWorkload.hpp" #include "ClWorkloads/ClPermuteWorkload.hpp" #include "ClWorkloads/ClNormalizationFloat32Workload.hpp" +#include "ClWorkloads/ClSoftmaxBaseWorkload.hpp" +#include "ClWorkloads/ClLstmFloat32Workload.hpp" #endif using namespace boost; @@ -31,7 +41,7 @@ namespace template bool IsMatchingSize2d(const TensorInfo& weightInfo) { - // Width & Height must match + // Width & Height must match. return (weightInfo.GetShape()[3] == FilterSize) && (weightInfo.GetShape()[2] == FilterSize); } @@ -88,58 +98,10 @@ inline bool IsWorkloadSupported(FuncType&& func, std::string* reasonIfUnsupporte } //namespace -bool IsClActivationUint8Supported(std::string* reasonIfUnsupported, const ActivationDescriptor& parameters) -{ - if (parameters.m_Function != ActivationFunction::BoundedReLu) - { - if (reasonIfUnsupported) - { - *reasonIfUnsupported = "Unsupported activation function, only BoundedReLu is supported"; - } - - return false; - } - - return true; -} - -bool IsClDepthwiseConvolution2dDescParamsSupported(std::string* reasonIfUnsupported, - const DepthwiseConvolution2dDescriptor& parameters, - const TensorInfo& weights) -{ - if (weights.GetNumDimensions() != 4) - { - if (reasonIfUnsupported) - { - *reasonIfUnsupported = "Depthwise convolution Weight tensor needs to be 4d"; - } - return false; - } - // weights.GetShape()[0] = channel multiplier - if (weights.GetShape()[0] != 1) - { - if (reasonIfUnsupported) - { - *reasonIfUnsupported = "Channel multiplier only supports the value 1 in the CL backend"; - } - return false; - } - else if ((weights.GetDataType() == armnn::DataType::QuantisedAsymm8) && !IsMatchingSize2d<3>(weights)) - { - if (reasonIfUnsupported) - { - *reasonIfUnsupported = "CL backend only supports 3x3 filtering for Depthwise Convolution on 8-bit"; - } - return false; - } - - return true; -} - -template +template bool IsSupportedForDataTypeCl(std::string* reasonIfUnsupported, DataType dataType, - Float32Func floatFuncPtr, + FloatFunc floatFuncPtr, Uint8Func uint8FuncPtr, Params&&... params) { @@ -147,19 +109,21 @@ bool IsSupportedForDataTypeCl(std::string* reasonIfUnsupported, IsSupportedForDataTypeGeneric(reasonIfUnsupported, dataType, floatFuncPtr, + floatFuncPtr, uint8FuncPtr, std::forward(params)...); } bool IsActivationSupportedCl(const TensorInfo& input, + const TensorInfo& output, const ActivationDescriptor& descriptor, std::string* reasonIfUnsupported) { - return IsSupportedForDataTypeCl(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc, - &IsClActivationUint8Supported, - descriptor); + FORWARD_WORKLOAD_VALIDATE_FUNC(ClActivationWorkloadValidate, + reasonIfUnsupported, + input, + output, + descriptor); } bool IsAdditionSupportedCl(const TensorInfo& input0, @@ -167,21 +131,30 @@ bool IsAdditionSupportedCl(const TensorInfo& input0, const TensorInfo& output, std::string* reasonIfUnsupported) { - return FORWARD_CL_LAYER_SUPPORT_FUNC(ClAdditionFloat32Workload::IsSupported(input0, + return FORWARD_CL_LAYER_SUPPORT_FUNC(ClAdditionValidate(input0, input1, output, reasonIfUnsupported)); } bool IsBatchNormalizationSupportedCl(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, const BatchNormalizationDescriptor& descriptor, std::string* reasonIfUnsupported) { - return IsSupportedForDataTypeCl(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc, - &FalseFuncU8, - descriptor); + FORWARD_WORKLOAD_VALIDATE_FUNC(ClBatchNormalizationValidate, + reasonIfUnsupported, + input, + output, + mean, + var, + beta, + gamma, + descriptor); } bool IsConstantSupportedCl(const TensorInfo& output, @@ -206,20 +179,20 @@ bool IsClDirectConvolution2dSupported(const TensorInfo& weightInfo, const Convol bool strideIsOneOrTwo = strideXIsOneOrTwo && strideYIsOneOrTwo; bool strideIsOneOrTwoOrThree = ( strideXIsOneOrTwo || strideXIsThree ) && ( strideYIsOneOrTwo || strideYIsThree ); - // 1x1 convolution with strides of 1,2,3 + // 1x1 convolution with strides of 1,2,3. isSupported |= IsMatchingSize2d<1>(weightInfo) && ( strideIsOneOrTwoOrThree ); - // 3x3 convolution with strides of 1,2 + // 3x3 convolution with strides of 1,2. isSupported |= IsMatchingSize2d<3>(weightInfo) && ( strideIsOneOrTwo ); // 5x5 convolution with strides of 1,2 isSupported |= IsMatchingSize2d<5>(weightInfo) && ( strideIsOneOrTwo ); - //fall back to normal convolution for the asymmetric padding case. + //Fall back to normal convolution for the asymmetric padding case. if (desc.m_PadLeft != desc.m_PadRight || desc.m_PadTop != desc.m_PadBottom) { - //direct convolution does not support asymmetric padding yet. + //Direct convolution does not support asymmetric padding yet. isSupported = false; } @@ -250,27 +223,40 @@ bool IsConvolution2dSupportedCl(const TensorInfo& input, } bool IsDepthwiseConvolutionSupportedCl(const TensorInfo& input, + const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, + const TensorInfo& biases, std::string* reasonIfUnsupported) { - return IsSupportedForDataTypeCl(reasonIfUnsupported, - input.GetDataType(), - &IsClDepthwiseConvolution2dDescParamsSupported, - &IsClDepthwiseConvolution2dDescParamsSupported, - descriptor, - weights); + FORWARD_WORKLOAD_VALIDATE_FUNC(ClDepthwiseConvolutionWorkloadValidate, + reasonIfUnsupported, + input, + output, + descriptor, + weights, + biases); } bool IsFullyConnectedSupportedCl(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, const FullyConnectedDescriptor& descriptor, std::string* reasonIfUnsupported) { - ignore_unused(descriptor); - return IsSupportedForDataTypeCl(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<>, - &FalseFuncU8<>); + // At the moment U8 is unsupported + if (input.GetDataType() == DataType::QuantisedAsymm8) + { + return false; + } + FORWARD_WORKLOAD_VALIDATE_FUNC(ClFullyConnectedWorkloadValidate, + reasonIfUnsupported, + input, + output, + weights, + biases, + descriptor); } bool IsInputSupportedCl(const TensorInfo& input, @@ -283,12 +269,10 @@ bool IsInputSupportedCl(const TensorInfo& input, } bool IsL2NormalizationSupportedCl(const TensorInfo& input, + const TensorInfo& output, std::string* reasonIfUnsupported) { - return IsSupportedForDataTypeCl(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<>, - &FalseFuncU8<>); + FORWARD_WORKLOAD_VALIDATE_FUNC(ClL2NormalizationWorkloadValidate, reasonIfUnsupported, input, output); } bool IsMergerSupportedCl(const std::vector inputs, @@ -304,13 +288,14 @@ bool IsMergerSupportedCl(const std::vector inputs, bool IsMultiplicationSupportedCl(const TensorInfo& input0, const TensorInfo& input1, + const TensorInfo& output, std::string* reasonIfUnsupported) { - ignore_unused(input1); - return IsSupportedForDataTypeCl(reasonIfUnsupported, - input0.GetDataType(), - &TrueFunc<>, - &FalseFuncU8<>); + FORWARD_WORKLOAD_VALIDATE_FUNC(ClMultiplicationWorkloadValidate, + reasonIfUnsupported, + input0, + input1, + output); } bool IsNormalizationSupportedCl(const TensorInfo& input, @@ -358,14 +343,12 @@ bool IsResizeBilinearSupportedCl(const TensorInfo& input, } bool IsSoftmaxSupportedCl(const TensorInfo& input, + const TensorInfo& output, const SoftmaxDescriptor& descriptor, std::string* reasonIfUnsupported) { ignore_unused(descriptor); - return IsSupportedForDataTypeCl(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<>, - &TrueFunc<>); + FORWARD_WORKLOAD_VALIDATE_FUNC(ClSoftmaxWorkloadValidate, reasonIfUnsupported, input, output); } bool IsSplitterSupportedCl(const TensorInfo& input, @@ -400,10 +383,59 @@ bool IsFloorSupportedCl(const TensorInfo& input, std::string* reasonIfUnsupported) { ignore_unused(output); - return IsSupportedForDataTypeCl(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<>, - &FalseFuncU8<>); + return IsClBackendSupported(reasonIfUnsupported) && + IsSupportedForDataTypeGeneric(reasonIfUnsupported, + input.GetDataType(), + &FalseFuncF16<>, + &TrueFunc<>, + &FalseFuncU8<>); +} + +bool IsLstmSupportedCl(const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor& descriptor, + const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported) +{ + FORWARD_WORKLOAD_VALIDATE_FUNC(ClLstmFloat32WorkloadValidate, reasonIfUnsupported, + input, outputStateIn, cellStateIn, scratchBuffer, outputStateOut, cellStateOut, + output, descriptor, inputToForgetWeights, inputToCellWeights, + inputToOutputWeights, recurrentToForgetWeights, + recurrentToCellWeights, recurrentToOutputWeights, + forgetGateBias, cellBias, outputGateBias, + inputToInputWeights, recurrentToInputWeights, + cellToInputWeights, inputGateBias, projectionWeights, + projectionBias, cellToForgetWeights, cellToOutputWeights); +} + +bool IsConvertFp16ToFp32SupportedCl(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported) +{ + FORWARD_WORKLOAD_VALIDATE_FUNC(ClConvertFp16ToFp32WorkloadValidate, + reasonIfUnsupported, + input, + output, + reasonIfUnsupported); +} + +bool IsConvertFp32ToFp16SupportedCl(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported) +{ + FORWARD_WORKLOAD_VALIDATE_FUNC(ClConvertFp32ToFp16WorkloadValidate, + reasonIfUnsupported, + input, + output, + reasonIfUnsupported); } } diff --git a/src/armnn/backends/ClLayerSupport.hpp b/src/armnn/backends/ClLayerSupport.hpp index 4f71e907cf..791e904616 100644 --- a/src/armnn/backends/ClLayerSupport.hpp +++ b/src/armnn/backends/ClLayerSupport.hpp @@ -7,16 +7,17 @@ #include #include #include +#include namespace armnn { bool IsClDirectConvolution2dSupported(const TensorInfo& weightInfo, const Convolution2dDescriptor& desc); -bool IsClActivationUint8Supported(std::string* reasonIfUnsupported, const ActivationDescriptor& parameters); bool IsClDepthwiseConvolution2dDescParamsSupported(std::string* reasonIfUnsupported, const DepthwiseConvolution2dDescriptor& parameters, const TensorInfo& weights); bool IsActivationSupportedCl(const TensorInfo& input, + const TensorInfo& output, const ActivationDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -26,6 +27,11 @@ bool IsAdditionSupportedCl(const TensorInfo& input0, std::string* reasonIfUnsupported = nullptr); bool IsBatchNormalizationSupportedCl(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, const BatchNormalizationDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -40,11 +46,16 @@ bool IsConvolution2dSupportedCl(const TensorInfo& input, std::string* reasonIfUnsupported = nullptr); bool IsDepthwiseConvolutionSupportedCl(const TensorInfo& input, + const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, + const TensorInfo& biases, std::string* reasonIfUnsupported = nullptr); bool IsFullyConnectedSupportedCl(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, const FullyConnectedDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -52,14 +63,30 @@ bool IsInputSupportedCl(const TensorInfo& input, std::string* reasonIfUnsupported = nullptr); bool IsL2NormalizationSupportedCl(const TensorInfo& input, + const TensorInfo& output, std::string* reasonIfUnsupported = nullptr); +bool IsLstmSupportedCl(const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor& descriptor, + const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported = nullptr); + bool IsMergerSupportedCl(const std::vector inputs, const OriginsDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); bool IsMultiplicationSupportedCl(const TensorInfo& input0, const TensorInfo& input1, + const TensorInfo& output, std::string* reasonIfUnsupported = nullptr); bool IsNormalizationSupportedCl(const TensorInfo& input, @@ -84,6 +111,7 @@ bool IsResizeBilinearSupportedCl(const TensorInfo& input, std::string* reasonIfUnsupported = nullptr); bool IsSoftmaxSupportedCl(const TensorInfo& input, + const TensorInfo& output, const SoftmaxDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -101,4 +129,13 @@ bool IsReshapeSupportedCl(const TensorInfo& input, bool IsFloorSupportedCl(const TensorInfo& input, const TensorInfo& output, std::string* reasonIfUnsupported = nullptr); + +bool IsConvertFp16ToFp32SupportedCl(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported = nullptr); + +bool IsConvertFp32ToFp16SupportedCl(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported = nullptr); + } diff --git a/src/armnn/backends/ClTensorHandle.hpp b/src/armnn/backends/ClTensorHandle.hpp index 49e18dad59..e3618a3c46 100644 --- a/src/armnn/backends/ClTensorHandle.hpp +++ b/src/armnn/backends/ClTensorHandle.hpp @@ -9,9 +9,12 @@ #include #include +#include +#include #include #include +#include namespace armnn { @@ -22,9 +25,8 @@ class IClTensorHandle : public ITensorHandle public: virtual arm_compute::ICLTensor& GetTensor() = 0; virtual arm_compute::ICLTensor const& GetTensor() const = 0; - virtual void Map(bool blocking = true) = 0; - virtual void UnMap() = 0; virtual arm_compute::DataType GetDataType() const = 0; + virtual void SetMemoryGroup(const std::shared_ptr& memoryGroup) = 0; }; class ClTensorHandle : public IClTensorHandle @@ -37,50 +39,98 @@ public: arm_compute::CLTensor& GetTensor() override { return m_Tensor; } arm_compute::CLTensor const& GetTensor() const override { return m_Tensor; } - virtual void Allocate() override {armnn::armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_Tensor);}; + virtual void Allocate() override {armnn::armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_Tensor);} - virtual void Map(bool blocking = true) override {m_Tensor.map(blocking);} - virtual void UnMap() override { m_Tensor.unmap();} + virtual void Manage() override + { + assert(m_MemoryGroup != nullptr); + m_MemoryGroup->manage(&m_Tensor); + } - virtual ITensorHandle::Type GetType() const override { return ITensorHandle::CL;} + virtual const void* Map(bool blocking = true) const override + { + const_cast(&m_Tensor)->map(blocking); + return static_cast(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes()); + } + virtual void Unmap() const override { const_cast(&m_Tensor)->unmap(); } + + virtual ITensorHandle::Type GetType() const override { return ITensorHandle::CL; } + + virtual ITensorHandle* GetParent() const override { return nullptr; } virtual arm_compute::DataType GetDataType() const override { return m_Tensor.info()->data_type(); } + virtual void SetMemoryGroup(const std::shared_ptr& memoryGroup) override + { + m_MemoryGroup = boost::polymorphic_pointer_downcast(memoryGroup); + } + + TensorShape GetStrides() const override + { + return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes()); + } + + TensorShape GetShape() const override + { + return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape()); + } private: arm_compute::CLTensor m_Tensor; - + std::shared_ptr m_MemoryGroup; }; class ClSubTensorHandle : public IClTensorHandle { public: - ClSubTensorHandle(arm_compute::ICLTensor& parent, - const arm_compute::TensorShape& shape, - const arm_compute::Coordinates& coords) - : m_Tensor(&parent, shape, coords) + ClSubTensorHandle(IClTensorHandle* parent, + const arm_compute::TensorShape& shape, + const arm_compute::Coordinates& coords) + : m_Tensor(&parent->GetTensor(), shape, coords) { + parentHandle = parent; } arm_compute::CLSubTensor& GetTensor() override { return m_Tensor; } arm_compute::CLSubTensor const& GetTensor() const override { return m_Tensor; } - virtual void Allocate() override {}; - virtual void Map(bool blocking = true) override {m_Tensor.map(blocking);} - virtual void UnMap() override { m_Tensor.unmap();} + virtual void Allocate() override {} + virtual void Manage() override {} - virtual ITensorHandle::Type GetType() const override { return ITensorHandle::CL;} + virtual const void* Map(bool blocking = true) const override + { + const_cast(&m_Tensor)->map(blocking); + return static_cast(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes()); + } + virtual void Unmap() const override { const_cast(&m_Tensor)->unmap(); } + + virtual ITensorHandle::Type GetType() const override { return ITensorHandle::CL; } + + virtual ITensorHandle* GetParent() const override { return parentHandle; } virtual arm_compute::DataType GetDataType() const override { return m_Tensor.info()->data_type(); } + virtual void SetMemoryGroup(const std::shared_ptr&) override {} + + TensorShape GetStrides() const override + { + return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes()); + } + + TensorShape GetShape() const override + { + return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape()); + } + private: - arm_compute::CLSubTensor m_Tensor; + mutable arm_compute::CLSubTensor m_Tensor; + ITensorHandle* parentHandle = nullptr; }; -} \ No newline at end of file +} diff --git a/src/armnn/backends/ClWorkloadFactory.cpp b/src/armnn/backends/ClWorkloadFactory.cpp index 916ca46aae..354440c7bc 100644 --- a/src/armnn/backends/ClWorkloadFactory.cpp +++ b/src/armnn/backends/ClWorkloadFactory.cpp @@ -15,9 +15,13 @@ #include #include #include + +#include "ClWorkloads.hpp" + #include "backends/MemCopyWorkload.hpp" #include "backends/ClTensorHandle.hpp" -#include "ClWorkloads.hpp" + +#include "memory/IPoolManager.hpp" #endif #include "MakeWorkloadHelper.hpp" @@ -29,7 +33,9 @@ namespace armnn { -bool ClWorkloadFactory::IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported) +bool ClWorkloadFactory::IsLayerSupported(const Layer& layer, + boost::optional dataType, + std::string& outReasonIfUnsupported) { return IWorkloadFactory::IsLayerSupported(Compute::GpuAcc, layer, dataType, outReasonIfUnsupported); } @@ -43,7 +49,10 @@ ClWorkloadFactory::ClWorkloadFactory() std::unique_ptr ClWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo) const { - return std::make_unique(tensorInfo); + std::unique_ptr tensorHandle = std::make_unique(tensorInfo); + tensorHandle->SetMemoryGroup(m_MemoryManager.GetInterLayerMemoryGroup()); + + return tensorHandle; } std::unique_ptr ClWorkloadFactory::CreateSubTensorHandle(ITensorHandle& parent, @@ -58,24 +67,25 @@ std::unique_ptr ClWorkloadFactory::CreateSubTensorHandle(ITensorH coords.set_num_dimensions(subTensorShape.GetNumDimensions()); for (unsigned int i = 0; i < subTensorShape.GetNumDimensions(); i++) { - // arm compute indexes tensor coords in reverse order + // Arm compute indexes tensor coords in reverse order. unsigned int revertedIndex = subTensorShape.GetNumDimensions() - i - 1; coords.set(i, boost::numeric_cast(subTensorOrigin[revertedIndex])); } - return std::make_unique(static_cast(parent).GetTensor(), shape, coords); + return std::make_unique( + boost::polymorphic_downcast(&parent), shape, coords); } std::unique_ptr ClWorkloadFactory::CreateInput(const InputQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload(descriptor, info); + return MakeWorkload(descriptor, info); } std::unique_ptr ClWorkloadFactory::CreateOutput(const OutputQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload(descriptor, info); + return MakeWorkload(descriptor, info); } std::unique_ptr ClWorkloadFactory::CreateActivation(const ActivationQueueDescriptor& descriptor, @@ -87,7 +97,8 @@ std::unique_ptr ClWorkloadFactory::CreateActivation(const ActivationQ std::unique_ptr ClWorkloadFactory::CreateSoftmax(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload(descriptor, info, m_MemoryManager.Get()); + return MakeWorkload(descriptor, info, + m_MemoryManager.GetIntraLayerManager()); } std::unique_ptr ClWorkloadFactory::CreateSplitter(const SplitterQueueDescriptor& descriptor, @@ -105,13 +116,14 @@ std::unique_ptr ClWorkloadFactory::CreateMerger(const MergerQu std::unique_ptr ClWorkloadFactory::CreateFullyConnected( const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload(descriptor, info, m_MemoryManager.Get()); + return MakeWorkload(descriptor, info, + m_MemoryManager.GetIntraLayerManager()); } std::unique_ptr ClWorkloadFactory::CreatePermute(const PermuteQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload(descriptor, info); + return MakeWorkload(descriptor, info); } std::unique_ptr ClWorkloadFactory::CreatePooling2d(const Pooling2dQueueDescriptor& descriptor, @@ -124,7 +136,7 @@ std::unique_ptr ClWorkloadFactory::CreateConvolution2d(const C const WorkloadInfo& info) const { return MakeWorkload(descriptor, info, - m_MemoryManager.Get()); + m_MemoryManager.GetIntraLayerManager()); } std::unique_ptr ClWorkloadFactory::CreateDepthwiseConvolution2d( @@ -142,7 +154,7 @@ std::unique_ptr ClWorkloadFactory::CreateNormalization(const N std::unique_ptr ClWorkloadFactory::CreateAddition(const AdditionQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload(descriptor, info); + return MakeWorkload(descriptor, info); } std::unique_ptr ClWorkloadFactory::CreateMultiplication( @@ -165,21 +177,7 @@ std::unique_ptr ClWorkloadFactory::CreateMemCopy(const MemCopy throw InvalidArgumentException("ClWorkloadFactory: Invalid null input for MemCopy workload"); } - // Create a workload that will copy tensor data from the inputs, which can have a number of different formats, - // to CL tensors. - switch (descriptor.m_Inputs[0]->GetType()) - { - case ITensorHandle::Cpu: - return MakeWorkload(descriptor, info); -#if ARMCOMPUTENEON_ENABLED - case ITensorHandle::Neon: - { - return MakeWorkload(descriptor, info); - } -#endif - default: - throw InvalidArgumentException("ClWorkloadFactory: Destination type not supported for MemCopy Workload."); - } + return MakeWorkload(descriptor, info); } std::unique_ptr ClWorkloadFactory::CreateResizeBilinear( @@ -220,11 +218,41 @@ std::unique_ptr ClWorkloadFactory::CreateFloor(const FloorQueueDescri return MakeWorkload(descriptor, info); } +std::unique_ptr ClWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return MakeWorkload(descriptor, info); +} + +std::unique_ptr ClWorkloadFactory::CreateConvertFp16ToFp32( + const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return std::make_unique(descriptor, info); +} + +std::unique_ptr ClWorkloadFactory::CreateConvertFp32ToFp16( + const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return std::make_unique(descriptor, info); +} + void ClWorkloadFactory::Finalize() { m_MemoryManager.Finalize(); } +void ClWorkloadFactory::Release() +{ + m_MemoryManager.Release(); +} + +void ClWorkloadFactory::Acquire() +{ + m_MemoryManager.Acquire(); +} + #else // #if ARMCOMPUTECL_ENABLED ClWorkloadFactory::ClWorkloadFactory() @@ -375,10 +403,38 @@ std::unique_ptr ClWorkloadFactory::CreateFloor(const FloorQueueDescri return nullptr; } +std::unique_ptr ClWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return nullptr; +} + +std::unique_ptr ClWorkloadFactory::CreateConvertFp16ToFp32( + const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return nullptr; +} + +std::unique_ptr ClWorkloadFactory::CreateConvertFp32ToFp16( + const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return nullptr; +} + void ClWorkloadFactory::Finalize() { } +void ClWorkloadFactory::Release() +{ +} + +void ClWorkloadFactory::Acquire() +{ +} + #endif // #if ARMCOMPUTECL_ENABLED } // namespace armnn diff --git a/src/armnn/backends/ClWorkloadFactory.hpp b/src/armnn/backends/ClWorkloadFactory.hpp index 7365fe9aeb..d0786f3fba 100644 --- a/src/armnn/backends/ClWorkloadFactory.hpp +++ b/src/armnn/backends/ClWorkloadFactory.hpp @@ -4,14 +4,17 @@ // #pragma once -#include "AclBaseMemoryManager.hpp" #include "OutputHandler.hpp" + #include "armnn/IRuntime.hpp" +#include + +#include "memory/BaseMemoryManager.hpp" namespace armnn { -// ARM Compute OpenCL workload factory +// ARM Compute OpenCL workload factory. class ClWorkloadFactory : public IWorkloadFactory { public: @@ -19,7 +22,8 @@ public: virtual Compute GetCompute() const override { return Compute::GpuAcc; } - static bool IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported); + static bool IsLayerSupported(const Layer& layer, boost::optional dataType, + std::string& outReasonIfUnsupported); virtual bool SupportsSubTensors() const override { return true; } @@ -95,11 +99,26 @@ public: virtual std::unique_ptr CreateFloor(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info) const override; - void Finalize() override; + virtual std::unique_ptr CreateLstm(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) const override; + + virtual std::unique_ptr CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) const override; + + virtual std::unique_ptr CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) const override; + + virtual void Finalize() override; + + virtual void Release() override; + + virtual void Acquire() override; private: - mutable AclBaseMemoryManager m_MemoryManager; +#ifdef ARMCOMPUTECL_ENABLED + mutable ClMemoryManager m_MemoryManager; +#endif }; } // namespace armnn diff --git a/src/armnn/backends/ClWorkloadUtils.hpp b/src/armnn/backends/ClWorkloadUtils.hpp index 549a0bbc25..6b6a18e865 100644 --- a/src/armnn/backends/ClWorkloadUtils.hpp +++ b/src/armnn/backends/ClWorkloadUtils.hpp @@ -9,6 +9,15 @@ #include #include #include "ArmComputeTensorUtils.hpp" +#include "OpenClTimer.hpp" +#include "CpuTensorHandle.hpp" +#include "Half.hpp" + +#define ARMNN_SCOPED_PROFILING_EVENT_CL(name) \ + ARMNN_SCOPED_PROFILING_EVENT_WITH_INSTRUMENTS(armnn::Compute::GpuAcc, \ + name, \ + armnn::OpenClTimer(), \ + armnn::WallClockTimer()) namespace armnn { @@ -17,12 +26,12 @@ template void CopyArmComputeClTensorData(const T* srcData, arm_compute::CLTensor& dstTensor) { { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "MapClTensorForWriting"); + ARMNN_SCOPED_PROFILING_EVENT_CL("MapClTensorForWriting"); dstTensor.map(true); } { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "CopyToClTensor"); + ARMNN_SCOPED_PROFILING_EVENT_CL("CopyToClTensor"); armcomputetensorutils::CopyArmComputeITensorData(srcData, dstTensor); } @@ -36,4 +45,21 @@ void InitialiseArmComputeClTensorData(arm_compute::CLTensor& clTensor, const T* CopyArmComputeClTensorData(data, clTensor); } +inline void InitializeArmComputeClTensorDataForFloatTypes(arm_compute::CLTensor& clTensor, + const ConstCpuTensorHandle *handle) +{ + BOOST_ASSERT(handle); + switch(handle->GetTensorInfo().GetDataType()) + { + case DataType::Float16: + InitialiseArmComputeClTensorData(clTensor, handle->GetConstTensor()); + break; + case DataType::Float32: + InitialiseArmComputeClTensorData(clTensor, handle->GetConstTensor()); + break; + default: + BOOST_ASSERT_MSG(false, "Unexpected floating point type."); + } +}; + } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads.hpp b/src/armnn/backends/ClWorkloads.hpp index 3b8cf50ace..9f5622a491 100644 --- a/src/armnn/backends/ClWorkloads.hpp +++ b/src/armnn/backends/ClWorkloads.hpp @@ -7,6 +7,7 @@ #include "backends/ClWorkloads/ClActivationFloat32Workload.hpp" #include "backends/ClWorkloads/ClActivationUint8Workload.hpp" #include "backends/ClWorkloads/ClAdditionFloat32Workload.hpp" +#include "backends/ClWorkloads/ClAdditionUint8Workload.hpp" #include "backends/ClWorkloads/ClBaseConstantWorkload.hpp" #include "backends/ClWorkloads/ClBaseMergerWorkload.hpp" #include "backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp" @@ -19,6 +20,7 @@ #include "backends/ClWorkloads/ClFloorFloat32Workload.hpp" #include "backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp" #include "backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp" +#include "backends/ClWorkloads/ClLstmFloat32Workload.hpp" #include "backends/ClWorkloads/ClMergerFloat32Workload.hpp" #include "backends/ClWorkloads/ClMergerUint8Workload.hpp" #include "backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp" @@ -32,4 +34,6 @@ #include "backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp" #include "backends/ClWorkloads/ClSoftmaxUint8Workload.hpp" #include "backends/ClWorkloads/ClSplitterFloat32Workload.hpp" -#include "backends/ClWorkloads/ClSplitterUint8Workload.hpp" \ No newline at end of file +#include "backends/ClWorkloads/ClSplitterUint8Workload.hpp" +#include "backends/ClWorkloads/ClConvertFp16ToFp32Workload.hpp" +#include "backends/ClWorkloads/ClConvertFp32ToFp16Workload.hpp" diff --git a/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.cpp index fb5d78425e..f072549cbc 100644 --- a/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.cpp @@ -9,10 +9,31 @@ namespace armnn { +arm_compute::Status ClActivationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const ActivationDescriptor& descriptor) +{ + const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + const arm_compute::ActivationLayerInfo activationLayerInfo = + ConvertActivationDescriptorToAclActivationLayerInfo(descriptor); + + if (input.GetDataType() == DataType::QuantisedAsymm8 && + activationLayerInfo.activation() == arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + return arm_compute::Status{arm_compute::ErrorCode::RUNTIME_ERROR, + "CL: Logistic Activations unsupported with QAsymm8 data type."}; + } + + return arm_compute::CLActivationLayer::validate(&aclInput, + &aclOutput, + activationLayerInfo); +} ClActivationFloat32Workload::ClActivationFloat32Workload(const ActivationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) { m_Data.ValidateInputsOutputs("ClActivationFloat32Workload", 1, 1); @@ -26,7 +47,7 @@ ClActivationFloat32Workload::ClActivationFloat32Workload(const ActivationQueueDe void ClActivationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClActivationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClActivationFloat32Workload_Execute"); m_ActivationLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.hpp index 9bab4202be..9fbfe95856 100644 --- a/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.hpp @@ -9,9 +9,12 @@ namespace armnn { +arm_compute::Status ClActivationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const ActivationDescriptor& descriptor); -// Activation layer execution -class ClActivationFloat32Workload : public Float32Workload +// Activation layer execution. +class ClActivationFloat32Workload : public FloatWorkload { public: ClActivationFloat32Workload(const ActivationQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp index 3671dd7187..75ab3d0691 100644 --- a/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp @@ -6,6 +6,7 @@ #include "ClActivationUint8Workload.hpp" #include "backends/ClLayerSupport.hpp" +#include "backends/ArmComputeUtils.hpp" #include "backends/ClTensorHandle.hpp" #include "backends/CpuTensorHandle.hpp" namespace armnn @@ -15,15 +16,8 @@ ClActivationUint8Workload::ClActivationUint8Workload(const ActivationQueueDescri const WorkloadInfo& info) : Uint8Workload(descriptor, info) { - - std::string reasonIfUnsupported; - if (!IsClActivationUint8Supported(&reasonIfUnsupported, m_Data.m_Parameters)) - { - throw InvalidArgumentException(reasonIfUnsupported); - } - - // Only BoundedReLu is supported (see IsClActivationUint8Supported) - arm_compute::ActivationLayerInfo layerInfo(arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + auto activation = ConvertActivationFunctionToAclActivationFunction(m_Data.m_Parameters.m_Function); + arm_compute::ActivationLayerInfo layerInfo(activation, m_Data.m_Parameters.m_A, m_Data.m_Parameters.m_B); @@ -37,7 +31,7 @@ ClActivationUint8Workload::ClActivationUint8Workload(const ActivationQueueDescri void ClActivationUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClActivationUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClActivationUint8Workload_Execute"); m_ActivationLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.hpp b/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.hpp index 3a9cceb298..449b2d56c5 100644 --- a/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -// Activation layer execution +// Activation layer execution. class ClActivationUint8Workload : public Uint8Workload { public: diff --git a/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.cpp b/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.cpp new file mode 100644 index 0000000000..5dd7bb323a --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.cpp @@ -0,0 +1,71 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "ClAdditionBaseWorkload.hpp" + +#include "backends/ClTensorHandle.hpp" +#include "backends/CpuTensorHandle.hpp" +#include "backends/ArmComputeTensorUtils.hpp" + +namespace armnn +{ +using namespace armcomputetensorutils; + +static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE; + +template +ClAdditionBaseWorkload::ClAdditionBaseWorkload(const AdditionQueueDescriptor& descriptor, + const WorkloadInfo& info) + : TypedWorkload(descriptor, info) +{ + this->m_Data.ValidateInputsOutputs("ClAdditionBaseWorkload", 2, 1); + + arm_compute::ICLTensor& input0 = static_cast(this->m_Data.m_Inputs[0])->GetTensor(); + arm_compute::ICLTensor& input1 = static_cast(this->m_Data.m_Inputs[1])->GetTensor(); + arm_compute::ICLTensor& output = static_cast(this->m_Data.m_Outputs[0])->GetTensor(); + m_Layer.configure(&input0, &input1, &output, g_AclConvertPolicy); +} + +template +void ClAdditionBaseWorkload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT_CL("ClAdditionBaseWorkload_Execute"); + m_Layer.run(); +} + +bool ClAdditionValidate(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output, + std::string* reasonIfUnsupported) +{ + if (input0.GetDataType() == DataType::QuantisedAsymm8) + { + // Reject quantised addition for the moment (COMPMID-1385) + *reasonIfUnsupported = "Quantised Addition not yet supported"; + return false; + } + + const arm_compute::TensorInfo aclInput0Info = BuildArmComputeTensorInfo(input0); + const arm_compute::TensorInfo aclInput1Info = BuildArmComputeTensorInfo(input1); + const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); + + const arm_compute::Status aclStatus = arm_compute::CLArithmeticAddition::validate(&aclInput0Info, + &aclInput1Info, + &aclOutputInfo, + g_AclConvertPolicy); + + const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK); + if (!supported && reasonIfUnsupported) + { + *reasonIfUnsupported = aclStatus.error_description(); + } + + return supported; +} + +} //namespace armnn + +template class armnn::ClAdditionBaseWorkload; +template class armnn::ClAdditionBaseWorkload; diff --git a/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.hpp b/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.hpp new file mode 100644 index 0000000000..fba8a0d457 --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.hpp @@ -0,0 +1,29 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/ClWorkloadUtils.hpp" + +namespace armnn +{ + +template +class ClAdditionBaseWorkload : public TypedWorkload +{ +public: + ClAdditionBaseWorkload(const AdditionQueueDescriptor& descriptor, const WorkloadInfo& info); + + void Execute() const override; + +private: + mutable arm_compute::CLArithmeticAddition m_Layer; +}; + +bool ClAdditionValidate(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output, + std::string* reasonIfUnsupported); +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp index 153167f172..b69593f5f5 100644 --- a/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp @@ -13,45 +13,10 @@ namespace armnn { using namespace armcomputetensorutils; -ClAdditionFloat32Workload::ClAdditionFloat32Workload(const AdditionQueueDescriptor& descriptor, - const WorkloadInfo& info) - : Float32Workload(descriptor, info) -{ - m_Data.ValidateInputsOutputs("ClAdditionFloat32Workload", 2, 1); - - arm_compute::ICLTensor& input0 = static_cast(m_Data.m_Inputs[0])->GetTensor(); - arm_compute::ICLTensor& input1 = static_cast(m_Data.m_Inputs[1])->GetTensor(); - arm_compute::ICLTensor& output = static_cast(m_Data.m_Outputs[0])->GetTensor(); - m_Layer.configure(&input0, &input1, &output, ms_AclConvertPolicy); -} - void ClAdditionFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClAdditionFloat32Workload_Execute"); - m_Layer.run(); -} - -bool ClAdditionFloat32Workload::IsSupported(const TensorInfo& input0, - const TensorInfo& input1, - const TensorInfo& output, - std::string* reasonIfUnsupported) -{ - const arm_compute::TensorInfo aclInput0Info = BuildArmComputeTensorInfo(input0); - const arm_compute::TensorInfo aclInput1Info = BuildArmComputeTensorInfo(input1); - const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); - - const arm_compute::Status aclStatus = decltype(m_Layer)::validate(&aclInput0Info, - &aclInput1Info, - &aclOutputInfo, - ms_AclConvertPolicy); - - const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK); - if (!supported && reasonIfUnsupported) - { - *reasonIfUnsupported = aclStatus.error_description(); - } - - return supported; + ARMNN_SCOPED_PROFILING_EVENT_CL("ClAdditionFloat32Workload_Execute"); + ClAdditionBaseWorkload::Execute(); } -} //namespace armnn \ No newline at end of file +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.hpp index 37e50c2c86..7eac485cfe 100644 --- a/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.hpp @@ -5,26 +5,16 @@ #pragma once -#include "backends/ClWorkloadUtils.hpp" +#include "ClAdditionBaseWorkload.hpp" namespace armnn { -class ClAdditionFloat32Workload : public Float32Workload +class ClAdditionFloat32Workload : public ClAdditionBaseWorkload { public: - ClAdditionFloat32Workload(const AdditionQueueDescriptor& descriptor, const WorkloadInfo& info); - + using ClAdditionBaseWorkload::ClAdditionBaseWorkload; void Execute() const override; - - static bool IsSupported(const TensorInfo& input0, - const TensorInfo& input1, - const TensorInfo& output, - std::string* reasonIfUnsupported); - -private: - mutable arm_compute::CLArithmeticAddition m_Layer; - static constexpr arm_compute::ConvertPolicy ms_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE; }; -} //namespace armnn \ No newline at end of file +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.cpp new file mode 100644 index 0000000000..a72ceca471 --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.cpp @@ -0,0 +1,18 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "ClAdditionUint8Workload.hpp" + +namespace armnn +{ +using namespace armcomputetensorutils; + +void ClAdditionUint8Workload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT_CL("ClAdditionUint8Workload_Execute"); + ClAdditionBaseWorkload::Execute(); +} + +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.hpp b/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.hpp new file mode 100644 index 0000000000..73ff287e7e --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.hpp @@ -0,0 +1,20 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "ClAdditionBaseWorkload.hpp" + +namespace armnn +{ + +class ClAdditionUint8Workload : public ClAdditionBaseWorkload +{ +public: + using ClAdditionBaseWorkload::ClAdditionBaseWorkload; + void Execute() const override; +}; + +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp b/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp index 4b72d92d72..e0bc365053 100644 --- a/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp +++ b/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp @@ -4,17 +4,19 @@ // #include "ClBaseConstantWorkload.hpp" +#include "backends/ArmComputeTensorUtils.hpp" #include "backends/ClTensorHandle.hpp" #include "backends/CpuTensorHandle.hpp" +#include "Half.hpp" namespace armnn { -template class ClBaseConstantWorkload; +template class ClBaseConstantWorkload; template class ClBaseConstantWorkload; -template -void ClBaseConstantWorkload::Execute() const +template +void ClBaseConstantWorkload::Execute() const { // The intermediate tensor held by the corresponding layer output handler can be initialised with the given data // on the first inference, then reused for subsequent inferences. @@ -26,15 +28,21 @@ void ClBaseConstantWorkload::Execute() const BOOST_ASSERT(data.m_LayerOutput != nullptr); arm_compute::CLTensor& output = static_cast(data.m_Outputs[0])->GetTensor(); + arm_compute::DataType computeDataType = static_cast(data.m_Outputs[0])->GetDataType(); - switch (dataType) + switch (computeDataType) { - case DataType::Float32: + case arm_compute::DataType::F16: + { + CopyArmComputeClTensorData(data.m_LayerOutput->GetConstTensor(), output); + break; + } + case arm_compute::DataType::F32: { CopyArmComputeClTensorData(data.m_LayerOutput->GetConstTensor(), output); break; } - case DataType::QuantisedAsymm8: + case arm_compute::DataType::QASYMM8: { CopyArmComputeClTensorData(data.m_LayerOutput->GetConstTensor(), output); break; diff --git a/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.hpp b/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.hpp index 660842f375..7ad7bb93ca 100644 --- a/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.hpp +++ b/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.hpp @@ -9,12 +9,12 @@ namespace armnn { -template -class ClBaseConstantWorkload : public TypedWorkload +template +class ClBaseConstantWorkload : public TypedWorkload { public: ClBaseConstantWorkload(const ConstantQueueDescriptor& descriptor, const WorkloadInfo& info) - : TypedWorkload(descriptor, info) + : TypedWorkload(descriptor, info) , m_RanOnce(false) { } diff --git a/src/armnn/backends/ClWorkloads/ClBaseMergerWorkload.hpp b/src/armnn/backends/ClWorkloads/ClBaseMergerWorkload.hpp index 7542c62b47..531e32961b 100644 --- a/src/armnn/backends/ClWorkloads/ClBaseMergerWorkload.hpp +++ b/src/armnn/backends/ClWorkloads/ClBaseMergerWorkload.hpp @@ -10,16 +10,16 @@ namespace armnn { -// Base class template providing an implementation of the Merger layer common to all data types -template -class ClBaseMergerWorkload : public TypedWorkload +// Base class template providing an implementation of the Merger layer common to all data types. +template +class ClBaseMergerWorkload : public TypedWorkload { public: - using TypedWorkload::TypedWorkload; + using TypedWorkload::TypedWorkload; void Execute() const override { - // With subtensors, merger is a no-op + // With subtensors, merger is a no-op. } }; diff --git a/src/armnn/backends/ClWorkloads/ClBaseSplitterWorkload.hpp b/src/armnn/backends/ClWorkloads/ClBaseSplitterWorkload.hpp index fef841ced2..8e4f10f9fd 100644 --- a/src/armnn/backends/ClWorkloads/ClBaseSplitterWorkload.hpp +++ b/src/armnn/backends/ClWorkloads/ClBaseSplitterWorkload.hpp @@ -10,16 +10,16 @@ namespace armnn { -// Base class template providing an implementation of the Splitter layer common to all data types -template -class ClBaseSplitterWorkload : public TypedWorkload +// Base class template providing an implementation of the Splitter layer common to all data types. +template +class ClBaseSplitterWorkload : public TypedWorkload { public: - using TypedWorkload::TypedWorkload; + using TypedWorkload::TypedWorkload; void Execute() const override { - // With subtensors, merger is a no-op + // With subtensors, merger is a no-op. } }; diff --git a/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.cpp index dabd495d59..1849c5d411 100644 --- a/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.cpp @@ -7,36 +7,88 @@ #include "backends/ClTensorHandle.hpp" #include "backends/CpuTensorHandle.hpp" #include "backends/ArmComputeTensorUtils.hpp" +#include "backends/ClLayerSupport.hpp" namespace armnn { using namespace armcomputetensorutils; +arm_compute::Status ClBatchNormalizationValidate(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, + const BatchNormalizationDescriptor &desc) +{ + const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); + const arm_compute::TensorInfo aclMeanInfo = BuildArmComputeTensorInfo(mean); + const arm_compute::TensorInfo aclVarInfo = BuildArmComputeTensorInfo(var); + const arm_compute::TensorInfo aclBetaInfo = BuildArmComputeTensorInfo(beta); + const arm_compute::TensorInfo aclGammaInfo = BuildArmComputeTensorInfo(gamma); + + return arm_compute::CLBatchNormalizationLayer::validate(&aclInputInfo, + &aclOutputInfo, + &aclMeanInfo, + &aclVarInfo, + &aclBetaInfo, + &aclGammaInfo, + desc.m_Eps); +} + ClBatchNormalizationFloat32Workload::ClBatchNormalizationFloat32Workload( const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) { - BuildArmComputeTensor(m_Mean, m_Data.m_Mean->GetTensorInfo()); - BuildArmComputeTensor(m_Variance, m_Data.m_Variance->GetTensorInfo()); - BuildArmComputeTensor(m_Gamma, m_Data.m_Gamma->GetTensorInfo()); - BuildArmComputeTensor(m_Beta, m_Data.m_Beta->GetTensorInfo()); + m_Mean = std::make_unique(); + BuildArmComputeTensor(*m_Mean, m_Data.m_Mean->GetTensorInfo()); + + m_Variance = std::make_unique(); + BuildArmComputeTensor(*m_Variance, m_Data.m_Variance->GetTensorInfo()); + + m_Gamma = std::make_unique(); + BuildArmComputeTensor(*m_Gamma, m_Data.m_Gamma->GetTensorInfo()); + + m_Beta = std::make_unique(); + BuildArmComputeTensor(*m_Beta, m_Data.m_Beta->GetTensorInfo()); m_Data.ValidateInputsOutputs("ClBatchNormalizationFloat32Workload", 1, 1); arm_compute::ICLTensor& input = static_cast(m_Data.m_Inputs[0])->GetTensor(); arm_compute::ICLTensor& output = static_cast(m_Data.m_Outputs[0])->GetTensor(); - m_Layer.configure(&input, &output, &m_Mean, &m_Variance, &m_Beta, &m_Gamma, m_Data.m_Parameters.m_Eps); - InitialiseArmComputeClTensorData(m_Mean, m_Data.m_Mean->GetConstTensor()); - InitialiseArmComputeClTensorData(m_Variance, m_Data.m_Variance->GetConstTensor()); - InitialiseArmComputeClTensorData(m_Beta, m_Data.m_Beta->GetConstTensor()); - InitialiseArmComputeClTensorData(m_Gamma, m_Data.m_Gamma->GetConstTensor()); + m_Layer.configure(&input, + &output, + m_Mean.get(), + m_Variance.get(), + m_Beta.get(), + m_Gamma.get(), + m_Data.m_Parameters.m_Eps); + + InitializeArmComputeClTensorDataForFloatTypes(*m_Mean, m_Data.m_Mean); + InitializeArmComputeClTensorDataForFloatTypes(*m_Variance, m_Data.m_Variance); + InitializeArmComputeClTensorDataForFloatTypes(*m_Beta, m_Data.m_Beta); + InitializeArmComputeClTensorDataForFloatTypes(*m_Gamma, m_Data.m_Gamma); + + // Force Compute Library to perform the necessary copying and reshaping, after which + // delete all the input tensors that will no longer be needed + m_Layer.prepare(); + FreeUnusedTensors(); } void ClBatchNormalizationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClBatchNormalizationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClBatchNormalizationFloat32Workload_Execute"); m_Layer.run(); } +void ClBatchNormalizationFloat32Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_Mean); + FreeTensorIfUnused(m_Variance); + FreeTensorIfUnused(m_Gamma); + FreeTensorIfUnused(m_Beta); +} + } //namespace armnn \ No newline at end of file diff --git a/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp index ddbd0f05c0..a45614a284 100644 --- a/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp @@ -10,21 +10,31 @@ namespace armnn { -class ClBatchNormalizationFloat32Workload : public Float32Workload +arm_compute::Status ClBatchNormalizationValidate(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, + const BatchNormalizationDescriptor& desc); + +class ClBatchNormalizationFloat32Workload : public FloatWorkload { public: ClBatchNormalizationFloat32Workload(const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info); - using Float32Workload::Float32Workload; + using FloatWorkload::FloatWorkload; void Execute() const override; private: mutable arm_compute::CLBatchNormalizationLayer m_Layer; - arm_compute::CLTensor m_Mean; - arm_compute::CLTensor m_Variance; - arm_compute::CLTensor m_Gamma; - arm_compute::CLTensor m_Beta; + std::unique_ptr m_Mean; + std::unique_ptr m_Variance; + std::unique_ptr m_Gamma; + std::unique_ptr m_Beta; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.cpp index 99880d68a7..58594999a8 100644 --- a/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.cpp @@ -9,7 +9,7 @@ namespace armnn void ClConstantFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClConstantFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClConstantFloat32Workload_Execute"); ClBaseConstantWorkload::Execute(); } diff --git a/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.hpp index 5f86d3b2b6..11c3fda8db 100644 --- a/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.hpp @@ -9,10 +9,10 @@ namespace armnn { -class ClConstantFloat32Workload : public ClBaseConstantWorkload +class ClConstantFloat32Workload : public ClBaseConstantWorkload { public: - using ClBaseConstantWorkload::ClBaseConstantWorkload; + using ClBaseConstantWorkload::ClBaseConstantWorkload; void Execute() const override; }; diff --git a/src/armnn/backends/ClWorkloads/ClConstantUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClConstantUint8Workload.cpp index 078d4261fa..82ce436557 100644 --- a/src/armnn/backends/ClWorkloads/ClConstantUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClConstantUint8Workload.cpp @@ -9,7 +9,7 @@ namespace armnn void ClConstantUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClConstantUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClConstantUint8Workload_Execute"); ClBaseConstantWorkload::Execute(); } diff --git a/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.cpp b/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.cpp new file mode 100644 index 0000000000..4914be78bc --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.cpp @@ -0,0 +1,64 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "ClConvertFp16ToFp32Workload.hpp" +#include "backends/ClTensorHandle.hpp" + +namespace armnn +{ +using namespace armcomputetensorutils; + +static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE; + +ClConvertFp16ToFp32Workload::ClConvertFp16ToFp32Workload( + const ConvertFp16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info) : + Float16ToFloat32Workload(descriptor, info) +{ + this->m_Data.ValidateInputsOutputs("ClConvertFp16ToFp32Workload", 1, 1); + + arm_compute::ICLTensor& input = static_cast(this->m_Data.m_Inputs[0])->GetTensor(); + arm_compute::ICLTensor& output = static_cast(this->m_Data.m_Outputs[0])->GetTensor(); + + m_Layer.configure(&input, &output, g_AclConvertPolicy, 0); +} + +void ClConvertFp16ToFp32Workload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvertFp16ToFp32Workload_Execute"); + m_Layer.run(); +} + +arm_compute::Status ClConvertFp16ToFp32WorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported) +{ + if (input.GetDataType() != DataType::Float16) + { + *reasonIfUnsupported = "Input should be Float16"; + return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, *reasonIfUnsupported); + } + if (output.GetDataType() != DataType::Float32) + { + *reasonIfUnsupported = "Output should be Float32"; + return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, *reasonIfUnsupported); + } + + const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); + + const arm_compute::Status aclStatus = arm_compute::CLDepthConvertLayer::validate( + &aclInputInfo, &aclOutputInfo, g_AclConvertPolicy, 0); + + const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK); + if (!supported && reasonIfUnsupported) + { + *reasonIfUnsupported = aclStatus.error_description(); + } + + return aclStatus; +} + + +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.hpp b/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.hpp new file mode 100644 index 0000000000..36ccbb7144 --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.hpp @@ -0,0 +1,28 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/ClWorkloadUtils.hpp" + +namespace armnn +{ + +class ClConvertFp16ToFp32Workload : public Float16ToFloat32Workload +{ +public: + + ClConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info); + virtual void Execute() const override; + +private: + mutable arm_compute::CLDepthConvertLayer m_Layer; +}; + +arm_compute::Status ClConvertFp16ToFp32WorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported); + +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.cpp b/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.cpp new file mode 100644 index 0000000000..19e064351f --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.cpp @@ -0,0 +1,64 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "ClConvertFp32ToFp16Workload.hpp" +#include "backends/ClTensorHandle.hpp" + +namespace armnn +{ +using namespace armcomputetensorutils; + +static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE; + +ClConvertFp32ToFp16Workload::ClConvertFp32ToFp16Workload( + const ConvertFp32ToFp16QueueDescriptor& descriptor, const WorkloadInfo& info) : + Float32ToFloat16Workload(descriptor, info) +{ + this->m_Data.ValidateInputsOutputs("ClConvertFp32ToFp16Workload", 1, 1); + + arm_compute::ICLTensor& input = static_cast(this->m_Data.m_Inputs[0])->GetTensor(); + arm_compute::ICLTensor& output = static_cast(this->m_Data.m_Outputs[0])->GetTensor(); + + m_Layer.configure(&input, &output, g_AclConvertPolicy, 0); +} + +void ClConvertFp32ToFp16Workload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvertFp32ToFp16Workload_Execute"); + m_Layer.run(); +} + +arm_compute::Status ClConvertFp32ToFp16WorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported) +{ + if (input.GetDataType() != DataType::Float32) + { + *reasonIfUnsupported = "Input should be Float32"; + return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, *reasonIfUnsupported); + } + if (output.GetDataType() != DataType::Float16) + { + *reasonIfUnsupported = "Output should be Float16"; + return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, *reasonIfUnsupported); + } + + const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); + + const arm_compute::Status aclStatus = arm_compute::CLDepthConvertLayer::validate( + &aclInputInfo, &aclOutputInfo, g_AclConvertPolicy, 0); + + const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK); + if (!supported && reasonIfUnsupported) + { + *reasonIfUnsupported = aclStatus.error_description(); + } + + return aclStatus; +} + + +} //namespace armnn \ No newline at end of file diff --git a/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.hpp b/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.hpp new file mode 100644 index 0000000000..02a442dabc --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.hpp @@ -0,0 +1,28 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/ClWorkloadUtils.hpp" + +namespace armnn +{ + +class ClConvertFp32ToFp16Workload : public Float32ToFloat16Workload +{ +public: + + ClConvertFp32ToFp16Workload(const ConvertFp32ToFp16QueueDescriptor& descriptor, const WorkloadInfo& info); + virtual void Execute() const override; + +private: + mutable arm_compute::CLDepthConvertLayer m_Layer; +}; + +arm_compute::Status ClConvertFp32ToFp16WorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported); + +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.cpp index d7aef3d223..9ac31df5c1 100644 --- a/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.cpp @@ -15,13 +15,15 @@ using namespace armcomputetensorutils; ClConvolution2dFloat32Workload::ClConvolution2dFloat32Workload(const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info, std::shared_ptr& memoryManager) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) , m_ConvolutionLayer(memoryManager) { - // todo: check tensor shapes match + // todo: check tensor shapes match. const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo(); - BuildArmComputeTensor(m_KernelTensor, weightInfo); + + m_KernelTensor = std::make_unique(); + BuildArmComputeTensor(*m_KernelTensor, weightInfo); arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX, m_Data.m_Parameters.m_StrideY, @@ -31,11 +33,10 @@ ClConvolution2dFloat32Workload::ClConvolution2dFloat32Workload(const Convolution m_Data.m_Parameters.m_PadBottom, arm_compute::DimensionRoundingType::FLOOR); - arm_compute::CLTensor* optionalBias = nullptr; if (m_Data.m_Parameters.m_BiasEnabled) { - BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); - optionalBias = &m_BiasTensor; + m_BiasTensor = std::make_unique(); + BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); } m_Data.ValidateInputsOutputs("ClConvolution2dFloat32Workload", 1, 1); @@ -44,24 +45,35 @@ ClConvolution2dFloat32Workload::ClConvolution2dFloat32Workload(const Convolution arm_compute::ICLTensor& output = static_cast(m_Data.m_Outputs[0])->GetTensor(); m_ConvolutionLayer.configure(&input, - &m_KernelTensor, - optionalBias, + m_KernelTensor.get(), + m_BiasTensor.get(), &output, padStrideInfo); - InitialiseArmComputeClTensorData(m_KernelTensor, m_Data.m_Weight->GetConstTensor()); + InitializeArmComputeClTensorDataForFloatTypes(*m_KernelTensor, m_Data.m_Weight); - if (optionalBias) + if (m_BiasTensor) { - InitialiseArmComputeClTensorData(*optionalBias, m_Data.m_Bias->GetConstTensor()); + InitializeArmComputeClTensorDataForFloatTypes(*m_BiasTensor, m_Data.m_Bias); } + + // Force Compute Library to perform the necessary copying and reshaping, after which + // delete all the input tensors that will no longer be needed + m_ConvolutionLayer.prepare(); + FreeUnusedTensors(); } void ClConvolution2dFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClConvolution2dFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvolution2dFloat32Workload_Execute"); m_ConvolutionLayer.run(); } +void ClConvolution2dFloat32Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_KernelTensor); + FreeTensorIfUnused(m_BiasTensor); +} + } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.hpp index 4cf73c89cc..51c21aec32 100644 --- a/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.hpp @@ -14,7 +14,7 @@ namespace armnn { -class ClConvolution2dFloat32Workload : public Float32Workload +class ClConvolution2dFloat32Workload : public FloatWorkload { public: ClConvolution2dFloat32Workload(const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info, @@ -22,10 +22,12 @@ public: void Execute() const override; private: - mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer; + mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer; - arm_compute::CLTensor m_KernelTensor; - arm_compute::CLTensor m_BiasTensor; + std::unique_ptr m_KernelTensor; + std::unique_ptr m_BiasTensor; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp index cf419e752e..a78d7fb4a2 100644 --- a/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp @@ -18,10 +18,11 @@ ClConvolution2dUint8Workload::ClConvolution2dUint8Workload(const Convolution2dQu : Uint8Workload(descriptor, info) , m_ConvolutionLayer(memoryManager) { - // todo: check tensor shapes match const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo(); - BuildArmComputeTensor(m_KernelTensor, weightInfo); + + m_KernelTensor = std::make_unique(); + BuildArmComputeTensor(*m_KernelTensor, weightInfo); arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX, m_Data.m_Parameters.m_StrideY, @@ -31,11 +32,10 @@ ClConvolution2dUint8Workload::ClConvolution2dUint8Workload(const Convolution2dQu m_Data.m_Parameters.m_PadBottom, arm_compute::DimensionRoundingType::FLOOR); - arm_compute::CLTensor* optionalBias = nullptr; if (m_Data.m_Parameters.m_BiasEnabled) { - BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); - optionalBias = &m_BiasTensor; + m_BiasTensor = std::make_unique(); + BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); } m_Data.ValidateInputsOutputs("ClConvolution2dUint8Workload", 1, 1); @@ -44,25 +44,36 @@ ClConvolution2dUint8Workload::ClConvolution2dUint8Workload(const Convolution2dQu arm_compute::ICLTensor& output = static_cast(m_Data.m_Outputs[0])->GetTensor(); m_ConvolutionLayer.configure(&input, - &m_KernelTensor, - optionalBias, + m_KernelTensor.get(), + m_BiasTensor.get(), &output, padStrideInfo); - InitialiseArmComputeClTensorData(m_KernelTensor, m_Data.m_Weight->GetConstTensor()); + InitialiseArmComputeClTensorData(*m_KernelTensor, m_Data.m_Weight->GetConstTensor()); - if (optionalBias) + if (m_BiasTensor) { - InitialiseArmComputeClTensorData(*optionalBias, m_Data.m_Bias->GetConstTensor()); + InitialiseArmComputeClTensorData(*m_BiasTensor, m_Data.m_Bias->GetConstTensor()); } + + // Force Compute Library to perform the necessary copying and reshaping, after which + // delete all the input tensors that will no longer be needed + m_ConvolutionLayer.prepare(); + FreeUnusedTensors(); } void ClConvolution2dUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClConvolution2dUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvolution2dUint8Workload_Execute"); m_ConvolutionLayer.run(); } +void ClConvolution2dUint8Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_KernelTensor); + FreeTensorIfUnused(m_BiasTensor); +} + } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.hpp b/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.hpp index d4d3908c80..7d9eb76ba1 100644 --- a/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.hpp @@ -22,10 +22,12 @@ public: void Execute() const override; private: - mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer; + mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer; - arm_compute::CLTensor m_KernelTensor; - arm_compute::CLTensor m_BiasTensor; + std::unique_ptr m_KernelTensor; + std::unique_ptr m_BiasTensor; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.cpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.cpp new file mode 100644 index 0000000000..cfb8485039 --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.cpp @@ -0,0 +1,122 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "ClDepthwiseConvolutionBaseWorkload.hpp" + +#include "TypeUtils.hpp" + +#include "backends/ArmComputeUtils.hpp" +#include "backends/ArmComputeTensorUtils.hpp" +#include "backends/ClTensorHandle.hpp" +#include "backends/CpuTensorHandle.hpp" + +namespace armnn +{ + +using namespace armcomputetensorutils; + +arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const DepthwiseConvolution2dDescriptor& descriptor, + const TensorInfo& weights, + const TensorInfo& biases) +{ + const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); + const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weights); + + arm_compute::TensorInfo aclBiasesInfo; + arm_compute::TensorInfo *optionalAclBiasesInfo = nullptr; + if (descriptor.m_BiasEnabled) + { + aclBiasesInfo = BuildArmComputeTensorInfo(biases); + optionalAclBiasesInfo = &aclBiasesInfo; + } + + const arm_compute::PadStrideInfo aclPadStrideInfo = BuildArmComputePadStrideInfo(descriptor); + const unsigned int aclDepthMultiplier = weights.GetShape()[0]; + + return arm_compute::CLDepthwiseConvolutionLayer::validate(&aclInputInfo, + &aclWeightsInfo, + optionalAclBiasesInfo, + &aclOutputInfo, + aclPadStrideInfo, + aclDepthMultiplier); +} + +template +ClDepthwiseConvolutionBaseWorkload::ClDepthwiseConvolutionBaseWorkload( + const DepthwiseConvolution2dQueueDescriptor& descriptor, + const WorkloadInfo& info) + : TypedWorkload(descriptor, info) +{ + auto& weightInfo = m_Data.m_Weight->GetTensorInfo(); + + m_KernelTensor = std::make_unique(); + BuildArmComputeTensor(*m_KernelTensor, weightInfo); + + if (m_Data.m_Parameters.m_BiasEnabled) + { + m_BiasTensor = std::make_unique(); + BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); + } + + arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX, + m_Data.m_Parameters.m_StrideY, + m_Data.m_Parameters.m_PadLeft, + m_Data.m_Parameters.m_PadRight, + m_Data.m_Parameters.m_PadTop, + m_Data.m_Parameters.m_PadBottom, + arm_compute::DimensionRoundingType::FLOOR); + + std::string name = std::string("ClDepthwiseConvolution") + + GetDataTypeName(m_Data.m_Weight->GetTensorInfo().GetDataType()) + "Workload"; + m_Data.ValidateInputsOutputs(name, 1, 1); + + arm_compute::ICLTensor& input = static_cast(m_Data.m_Inputs[0])->GetTensor(); + arm_compute::ICLTensor& output = static_cast(m_Data.m_Outputs[0])->GetTensor(); + + const unsigned int depthMultiplier = weightInfo.GetShape()[0]; + + //Check for optimisation opportunities. + bool use3x3Optimisation = (weightInfo.GetShape()[3] == 3) && (weightInfo.GetShape()[2] == 3); + if (use3x3Optimisation) + { + m_DepthwiseConvolutionLayer = std::make_unique(); + static_cast(m_DepthwiseConvolutionLayer.get())->configure( + &input, + m_KernelTensor.get(), + m_BiasTensor.get(), + &output, + padStrideInfo, + depthMultiplier); + } + else + { + m_DepthwiseConvolutionLayer = std::make_unique(); + static_cast(m_DepthwiseConvolutionLayer.get())->configure( + &input, + m_KernelTensor.get(), + m_BiasTensor.get(), + &output, + padStrideInfo, + depthMultiplier); + } + + BOOST_ASSERT(m_DepthwiseConvolutionLayer); +} + +template +void ClDepthwiseConvolutionBaseWorkload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_KernelTensor); + FreeTensorIfUnused(m_BiasTensor); +} + +// Generate known implementations for linker +template class ClDepthwiseConvolutionBaseWorkload; +template class ClDepthwiseConvolutionBaseWorkload; + +} // namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp new file mode 100644 index 0000000000..a879efc89e --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp @@ -0,0 +1,37 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/ClWorkloadUtils.hpp" + +namespace armnn +{ + +arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const DepthwiseConvolution2dDescriptor& descriptor, + const TensorInfo& weights, + const TensorInfo& biases); + +template +class ClDepthwiseConvolutionBaseWorkload : public TypedWorkload +{ +public: + using TypedWorkload::m_Data; + + ClDepthwiseConvolutionBaseWorkload(const DepthwiseConvolution2dQueueDescriptor& descriptor, + const WorkloadInfo& info); + +protected: + std::unique_ptr m_DepthwiseConvolutionLayer; + + std::unique_ptr m_KernelTensor; + std::unique_ptr m_BiasTensor; + + void FreeUnusedTensors(); +}; + +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp index f31c73bc60..96d97ad4ea 100644 --- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp @@ -4,8 +4,8 @@ // #include "ClDepthwiseConvolutionFloat32Workload.hpp" -#include "ClDepthwiseConvolutionHelper.hpp" -#include "backends/ClTensorHandle.hpp" + +#include "backends/ClWorkloadUtils.hpp" #include "backends/CpuTensorHandle.hpp" namespace armnn @@ -14,17 +14,25 @@ namespace armnn ClDepthwiseConvolutionFloat32Workload::ClDepthwiseConvolutionFloat32Workload( const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload(descriptor, info) + : ClDepthwiseConvolutionBaseWorkload(descriptor, info) { - InitClDepthwiseConvolutionWorkload(*this); + InitializeArmComputeClTensorDataForFloatTypes(*m_KernelTensor, m_Data.m_Weight); + + if (m_BiasTensor) + { + InitializeArmComputeClTensorDataForFloatTypes(*m_BiasTensor, m_Data.m_Bias); + } + + m_DepthwiseConvolutionLayer->prepare(); + FreeUnusedTensors(); } void ClDepthwiseConvolutionFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClDepthwiseConvolutionFloat32Workload_Execute"); - BOOST_ASSERT(m_pDepthwiseConvolutionLayer); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClDepthwiseConvolutionFloat32Workload_Execute"); + BOOST_ASSERT(m_DepthwiseConvolutionLayer); - m_pDepthwiseConvolutionLayer->run(); + m_DepthwiseConvolutionLayer->run(); } } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp index 8711f0c515..669fd928b5 100644 --- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp @@ -5,29 +5,20 @@ #pragma once +#include "ClDepthwiseConvolutionBaseWorkload.hpp" + #include "backends/ClWorkloadUtils.hpp" namespace armnn { -class ClDepthwiseConvolutionFloat32Workload : public Float32Workload +class ClDepthwiseConvolutionFloat32Workload : public ClDepthwiseConvolutionBaseWorkload { public: ClDepthwiseConvolutionFloat32Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info); void Execute() const override; - -private: - typedef float KernelDataType; - typedef float BiasDataType; - - mutable std::unique_ptr m_pDepthwiseConvolutionLayer; - - arm_compute::CLTensor m_KernelTensor; - arm_compute::CLTensor m_BiasTensor; - - template - friend void InitClDepthwiseConvolutionWorkload(WorkloadType& workload); }; } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionHelper.hpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionHelper.hpp deleted file mode 100644 index cd7115773d..0000000000 --- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionHelper.hpp +++ /dev/null @@ -1,91 +0,0 @@ -// -// Copyright © 2017 Arm Ltd. All rights reserved. -// See LICENSE file in the project root for full license information. -// - -#pragma once - -#include -#include "backends/ClLayerSupport.hpp" -#include "backends/ArmComputeTensorUtils.hpp" -#include "backends/ClTensorHandle.hpp" - -namespace armnn -{ - -template -void InitClDepthwiseConvolutionWorkload(WorkloadType& workload) -{ - using T = typename WorkloadType::KernelDataType; - using B = typename WorkloadType::BiasDataType; - - auto& m_Data = workload.GetData(); - auto& m_KernelTensor = workload.m_KernelTensor; - auto& m_BiasTensor = workload.m_BiasTensor; - auto& m_pDepthwiseConvolutionLayer = workload.m_pDepthwiseConvolutionLayer; - - auto& weightInfo = m_Data.m_Weight->GetTensorInfo(); - - std::string reasonIfUnsupported; - if (!IsClDepthwiseConvolution2dDescParamsSupported(&reasonIfUnsupported, m_Data.m_Parameters, weightInfo)) - { - throw UnimplementedException(reasonIfUnsupported); - } - - armcomputetensorutils::BuildArmComputeTensor(m_KernelTensor, weightInfo); - - arm_compute::CLTensor* optionalBias = nullptr; - if (m_Data.m_Parameters.m_BiasEnabled) - { - armcomputetensorutils::BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); - optionalBias = &m_BiasTensor; - } - - arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX, - m_Data.m_Parameters.m_StrideY, - m_Data.m_Parameters.m_PadLeft, - m_Data.m_Parameters.m_PadRight, - m_Data.m_Parameters.m_PadTop, - m_Data.m_Parameters.m_PadBottom, - arm_compute::DimensionRoundingType::FLOOR); - - std::string name = std::string("ClDepthwiseConvolution") + GetDataTypeName(GetDataType()) + "Workload"; - m_Data.ValidateInputsOutputs(name, 1, 1); - - arm_compute::ICLTensor& input = static_cast(m_Data.m_Inputs[0])->GetTensor(); - arm_compute::ICLTensor& output = static_cast(m_Data.m_Outputs[0])->GetTensor(); - - //Check for optimisation opportunities. - bool use3x3Optimisation = (weightInfo.GetShape()[3] == 3) && (weightInfo.GetShape()[2] == 3); - if (use3x3Optimisation) - { - m_pDepthwiseConvolutionLayer = std::make_unique(); - static_cast(m_pDepthwiseConvolutionLayer.get())->configure( - &input, - &m_KernelTensor, - optionalBias, - &output, - padStrideInfo); - } - else - { - m_pDepthwiseConvolutionLayer = std::make_unique(); - static_cast(m_pDepthwiseConvolutionLayer.get())->configure( - &input, - &m_KernelTensor, - optionalBias, - &output, - padStrideInfo); - } - - BOOST_ASSERT(m_pDepthwiseConvolutionLayer); - - InitialiseArmComputeClTensorData(m_KernelTensor, m_Data.m_Weight->template GetConstTensor()); - - if (optionalBias) - { - InitialiseArmComputeClTensorData(*optionalBias, m_Data.m_Bias->template GetConstTensor()); - } -} - -} //namespace armnn \ No newline at end of file diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp index 7e7c488c74..4852ce8bf9 100644 --- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp @@ -4,28 +4,34 @@ // #include "ClDepthwiseConvolutionUint8Workload.hpp" -#include "ClDepthwiseConvolutionHelper.hpp" -#include "backends/ClTensorHandle.hpp" + #include "backends/CpuTensorHandle.hpp" namespace armnn { - ClDepthwiseConvolutionUint8Workload::ClDepthwiseConvolutionUint8Workload( const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) - : Uint8Workload(descriptor, info) + : ClDepthwiseConvolutionBaseWorkload(descriptor, info) { - InitClDepthwiseConvolutionWorkload(*this); + InitialiseArmComputeClTensorData(*m_KernelTensor, m_Data.m_Weight->template GetConstTensor()); + + if (m_BiasTensor) + { + InitialiseArmComputeClTensorData(*m_BiasTensor, m_Data.m_Bias->template GetConstTensor()); + } + + m_DepthwiseConvolutionLayer->prepare(); + FreeUnusedTensors(); } void ClDepthwiseConvolutionUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClDepthwiseConvolutionUint8Workload_Execute"); - BOOST_ASSERT(m_pDepthwiseConvolutionLayer); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClDepthwiseConvolutionUint8Workload_Execute"); + BOOST_ASSERT(m_DepthwiseConvolutionLayer); - m_pDepthwiseConvolutionLayer->run(); + m_DepthwiseConvolutionLayer->run(); } } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp index ee09ff3e58..a4277d405f 100644 --- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp @@ -5,29 +5,19 @@ #pragma once +#include "ClDepthwiseConvolutionBaseWorkload.hpp" + #include "backends/ClWorkloadUtils.hpp" namespace armnn { -class ClDepthwiseConvolutionUint8Workload : public Uint8Workload +class ClDepthwiseConvolutionUint8Workload : public ClDepthwiseConvolutionBaseWorkload { public: ClDepthwiseConvolutionUint8Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info); void Execute() const override; - -private: - typedef uint8_t KernelDataType; - typedef int32_t BiasDataType; - - mutable std::unique_ptr m_pDepthwiseConvolutionLayer; - - arm_compute::CLTensor m_KernelTensor; - arm_compute::CLTensor m_BiasTensor; - - template - friend void InitClDepthwiseConvolutionWorkload(WorkloadType& workload); }; } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp index 882da50855..da71c50305 100644 --- a/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp @@ -10,7 +10,7 @@ namespace armnn { ClFloorFloat32Workload::ClFloorFloat32Workload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) { m_Data.ValidateInputsOutputs("ClFloorFloat32Workload", 1, 1); @@ -22,7 +22,7 @@ ClFloorFloat32Workload::ClFloorFloat32Workload(const FloorQueueDescriptor& descr void ClFloorFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClFloorFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClFloorFloat32Workload_Execute"); m_Layer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.hpp index 532dd29884..bd7f3032fc 100644 --- a/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -class ClFloorFloat32Workload : public Float32Workload +class ClFloorFloat32Workload : public FloatWorkload { public: ClFloorFloat32Workload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp index 5dfab9cbbd..5014dd27ca 100644 --- a/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp @@ -7,47 +7,89 @@ #include "backends/ClTensorHandle.hpp" #include "backends/CpuTensorHandle.hpp" #include "backends/ArmComputeTensorUtils.hpp" +#include "backends/ArmComputeUtils.hpp" +#include "backends/ClLayerSupport.hpp" namespace armnn { using namespace armcomputetensorutils; +arm_compute::Status ClFullyConnectedWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, + const FullyConnectedDescriptor& descriptor) +{ + const arm_compute::TensorInfo aclInput = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutput = BuildArmComputeTensorInfo(output); + const arm_compute::TensorInfo aclWeights = BuildArmComputeTensorInfo(weights); + + arm_compute::TensorInfo aclBiases; + arm_compute::TensorInfo *optionalAclBiases = nullptr; + if (descriptor.m_BiasEnabled) + { + aclBiases = BuildArmComputeTensorInfo(biases); + optionalAclBiases = &aclBiases; + } + + const arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo = + ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor); + + return arm_compute::CLFullyConnectedLayer::validate(&aclInput, + &aclWeights, + optionalAclBiases, + &aclOutput, + fullyConnectedLayerInfo); +} + ClFullyConnectedFloat32Workload::ClFullyConnectedFloat32Workload(const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info, std::shared_ptr& memoryManager) - : Float32Workload(descriptor, info) - , m_FullyConnected(memoryManager) + : FloatWorkload(descriptor, info) + , m_FullyConnectedLayer(memoryManager) { + m_WeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_WeightsTensor, m_Data.m_Weight->GetTensorInfo()); - BuildArmComputeTensor(m_WeightsTensor, m_Data.m_Weight->GetTensorInfo()); - - arm_compute::CLTensor* optionalBiasTensor = nullptr; if (m_Data.m_Parameters.m_BiasEnabled) { - BuildArmComputeTensor(m_BiasesTensor, m_Data.m_Bias->GetTensorInfo()); - optionalBiasTensor = &m_BiasesTensor; + m_BiasesTensor = std::make_unique(); + BuildArmComputeTensor(*m_BiasesTensor, m_Data.m_Bias->GetTensorInfo()); } m_Data.ValidateInputsOutputs("ClFullyConnectedFloat32Workload", 1, 1); arm_compute::ICLTensor& input = static_cast(m_Data.m_Inputs[0])->GetTensor(); arm_compute::ICLTensor& output = static_cast(m_Data.m_Outputs[0])->GetTensor(); + // Construct - m_FullyConnected.configure( - &input, &m_WeightsTensor, optionalBiasTensor, &output, m_Data.m_Parameters.m_TransposeWeightMatrix); + arm_compute::FullyConnectedLayerInfo fc_info; + fc_info.transpose_weights = m_Data.m_Parameters.m_TransposeWeightMatrix; + m_FullyConnectedLayer.configure(&input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output, fc_info); // Allocate - InitialiseArmComputeClTensorData(m_WeightsTensor, m_Data.m_Weight->GetConstTensor()); + InitializeArmComputeClTensorDataForFloatTypes(*m_WeightsTensor, m_Data.m_Weight); - if (optionalBiasTensor) + if (m_BiasesTensor) { - InitialiseArmComputeClTensorData(*optionalBiasTensor, m_Data.m_Bias->GetConstTensor()); + InitializeArmComputeClTensorDataForFloatTypes(*m_BiasesTensor, m_Data.m_Bias); } + + // Force Compute Library to perform the necessary copying and reshaping, after which + // delete all the input tensors that will no longer be needed + m_FullyConnectedLayer.prepare(); + FreeUnusedTensors(); } void ClFullyConnectedFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClFullyConnectedFloat32Workload_Execute"); - m_FullyConnected.run(); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClFullyConnectedFloat32Workload_Execute"); + m_FullyConnectedLayer.run(); +} + +void ClFullyConnectedFloat32Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_WeightsTensor); + FreeTensorIfUnused(m_BiasesTensor); } } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp index c8d1227bda..f580e580c6 100644 --- a/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp @@ -14,20 +14,29 @@ namespace armnn { -class ClFullyConnectedFloat32Workload : public armnn::Float32Workload +arm_compute::Status ClFullyConnectedWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, + const FullyConnectedDescriptor& descriptor); + +class ClFullyConnectedFloat32Workload : public armnn::FloatWorkload { public: ClFullyConnectedFloat32Workload(const armnn::FullyConnectedQueueDescriptor& descriptor, const armnn::WorkloadInfo& info, std::shared_ptr& memoryManager); - using armnn::Float32Workload::m_Data; + using armnn::FloatWorkload::m_Data; void Execute() const override; private: - mutable arm_compute::CLFullyConnectedLayer m_FullyConnected; - arm_compute::CLTensor m_WeightsTensor; - arm_compute::CLTensor m_BiasesTensor; + mutable arm_compute::CLFullyConnectedLayer m_FullyConnectedLayer; + + std::unique_ptr m_WeightsTensor; + std::unique_ptr m_BiasesTensor; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp index e15db74ec9..628e38d3da 100644 --- a/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp @@ -12,9 +12,21 @@ namespace armnn { using namespace armcomputetensorutils; +arm_compute::Status ClL2NormalizationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output) +{ + const arm_compute::TensorInfo aclInput = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutput = BuildArmComputeTensorInfo(output); + + arm_compute::NormalizationLayerInfo normalizationInfo = + CreateAclNormalizationLayerInfoForL2Normalization(input); + + return arm_compute::CLNormalizationLayer::validate(&aclInput, &aclOutput, normalizationInfo); +} + ClL2NormalizationFloat32Workload::ClL2NormalizationFloat32Workload(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) { m_Data.ValidateInputsOutputs("ClL2NormalizationFloat32Workload", 1, 1); @@ -25,7 +37,7 @@ ClL2NormalizationFloat32Workload::ClL2NormalizationFloat32Workload(const L2Norma void ClL2NormalizationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClL2NormalizationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClL2NormalizationFloat32Workload_Execute"); m_Layer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp index 848803e2f0..bf898e31f7 100644 --- a/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp @@ -10,7 +10,10 @@ namespace armnn { -class ClL2NormalizationFloat32Workload : public Float32Workload +arm_compute::Status ClL2NormalizationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output); + +class ClL2NormalizationFloat32Workload : public FloatWorkload { public: ClL2NormalizationFloat32Workload(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.cpp new file mode 100644 index 0000000000..db5c303854 --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.cpp @@ -0,0 +1,405 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "ClLstmFloat32Workload.hpp" +#include "backends/ClTensorHandle.hpp" +#include "backends/CpuTensorHandle.hpp" +#include "backends/ArmComputeTensorUtils.hpp" +#include "backends/ClLayerSupport.hpp" +#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h" + +namespace armnn +{ +using namespace armcomputetensorutils; + +ClLstmFloat32Workload::ClLstmFloat32Workload(const LstmQueueDescriptor &descriptor, const WorkloadInfo &info) + : FloatWorkload(descriptor, info) +{ + arm_compute::LSTMParams lstm_param; + + // Basic parameters + m_InputToForgetWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_InputToForgetWeightsTensor, m_Data.m_InputToForgetWeights->GetTensorInfo()); + + m_InputToCellWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_InputToCellWeightsTensor, m_Data.m_InputToCellWeights->GetTensorInfo()); + + m_InputToOutputWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_InputToOutputWeightsTensor, m_Data.m_InputToOutputWeights->GetTensorInfo()); + + m_RecurrentToForgetWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_RecurrentToForgetWeightsTensor, m_Data.m_RecurrentToForgetWeights->GetTensorInfo()); + + m_RecurrentToCellWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_RecurrentToCellWeightsTensor, m_Data.m_RecurrentToCellWeights->GetTensorInfo()); + + m_RecurrentToOutputWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_RecurrentToOutputWeightsTensor, m_Data.m_RecurrentToOutputWeights->GetTensorInfo()); + + m_ForgetGateBiasTensor = std::make_unique(); + BuildArmComputeTensor(*m_ForgetGateBiasTensor, m_Data.m_ForgetGateBias->GetTensorInfo()); + + m_CellBiasTensor = std::make_unique(); + BuildArmComputeTensor(*m_CellBiasTensor, m_Data.m_CellBias->GetTensorInfo()); + + m_OutputGateBiasTensor = std::make_unique(); + BuildArmComputeTensor(*m_OutputGateBiasTensor, m_Data.m_OutputGateBias->GetTensorInfo()); + + // for future reference: check the AndroidNN API for the logic here + if (!m_Data.m_Parameters.m_CifgEnabled) + { + m_InputToInputWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_InputToInputWeightsTensor, m_Data.m_InputToInputWeights->GetTensorInfo()); + + m_RecurrentToInputWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_RecurrentToInputWeightsTensor, m_Data.m_RecurrentToInputWeights->GetTensorInfo()); + + m_CellToInputWeightsTensor = std::make_unique(); + if (m_Data.m_CellToInputWeights != nullptr) + { + BuildArmComputeTensor(*m_CellToInputWeightsTensor, m_Data.m_CellToInputWeights->GetTensorInfo()); + } + + m_InputGateBiasTensor = std::make_unique(); + BuildArmComputeTensor(*m_InputGateBiasTensor, m_Data.m_InputGateBias->GetTensorInfo()); + + lstm_param.set_cifg_params(m_InputToInputWeightsTensor.get(), + m_RecurrentToInputWeightsTensor.get(), + m_Data.m_CellToInputWeights != nullptr ? m_CellToInputWeightsTensor.get() : nullptr, + m_InputGateBiasTensor.get()); + } + + if (m_Data.m_Parameters.m_ProjectionEnabled) + { + m_ProjectionWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_ProjectionWeightsTensor, m_Data.m_ProjectionWeights->GetTensorInfo()); + + m_ProjectionBiasTensor = std::make_unique(); + if (m_Data.m_ProjectionBias != nullptr) + { + BuildArmComputeTensor(*m_ProjectionBiasTensor, m_Data.m_ProjectionBias->GetTensorInfo()); + } + + lstm_param.set_projection_params(m_ProjectionWeightsTensor.get(), + m_Data.m_ProjectionBias != nullptr ? m_ProjectionBiasTensor.get() : nullptr); + } + + if (m_Data.m_Parameters.m_PeepholeEnabled) + { + m_CellToForgetWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_CellToForgetWeightsTensor, m_Data.m_CellToForgetWeights->GetTensorInfo()); + + m_CellToOutputWeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_CellToOutputWeightsTensor, m_Data.m_CellToOutputWeights->GetTensorInfo()); + + lstm_param.set_peephole_params(m_CellToForgetWeightsTensor.get(), m_CellToOutputWeightsTensor.get()); + } + + const arm_compute::ICLTensor& input = static_cast(m_Data.m_Inputs[0])->GetTensor(); + const arm_compute::ICLTensor& output_state_in = static_cast(m_Data.m_Inputs[1])->GetTensor(); + const arm_compute::ICLTensor& cell_state_in = static_cast(m_Data.m_Inputs[2])->GetTensor(); + + arm_compute::ICLTensor& output_state_out = static_cast(m_Data.m_Outputs[1])->GetTensor(); + arm_compute::ICLTensor& cell_state_out = static_cast(m_Data.m_Outputs[2])->GetTensor(); + arm_compute::ICLTensor& output = static_cast(m_Data.m_Outputs[3])->GetTensor(); + + // Get the batch_size and the num_units from the cellStateIn dimensions + const TensorInfo& inputTensorInfo = info.m_InputTensorInfos[2]; + const unsigned int batch_size = boost::numeric_cast(inputTensorInfo.GetShape()[0]); + const unsigned int num_units = boost::numeric_cast(inputTensorInfo.GetShape()[1]); + + m_ScratchBuffer = std::make_unique(); + if (m_Data.m_Parameters.m_CifgEnabled) + { + // 2D tensor with dimensions [num_units * 4, batch_size] with CIFG + armnn::TensorInfo scratchBuffer1({ batch_size, num_units * 4 }, DataType::Float32); + BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer1); + } + else + { + // scratch_buffer [num_units * 3, batch_size] without CIFG + armnn::TensorInfo scratchBuffer2({ batch_size, num_units * 3 }, DataType::Float32); + BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer2); + } + + float cell_threshold = m_Data.m_Parameters.m_ClippingThresCell; + float projection_threshold = m_Data.m_Parameters.m_ClippingThresProj; + + // for preparing the object for the class ActivationLayerInfo, we need to consider 5 situations + arm_compute::ActivationLayerInfo activationLayerInfo; + if (m_Data.m_Parameters.m_ActivationFunc == 0) + { + // no activation, do nothing + } + else if (m_Data.m_Parameters.m_ActivationFunc == 1) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::RELU); + } + else if (m_Data.m_Parameters.m_ActivationFunc == 3) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0); + } + else if (m_Data.m_Parameters.m_ActivationFunc == 4) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0, 1.0); + } + else if (m_Data.m_Parameters.m_ActivationFunc == 6) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC); + } + else + { + throw armnn::Exception("Wrong Type of Activation Function!"); + } + + + m_LstmLayer.configure(&input, m_InputToForgetWeightsTensor.get(), m_InputToCellWeightsTensor.get(), + m_InputToOutputWeightsTensor.get(), m_RecurrentToForgetWeightsTensor.get(), + m_RecurrentToCellWeightsTensor.get(), m_RecurrentToOutputWeightsTensor.get(), + m_ForgetGateBiasTensor.get(), m_CellBiasTensor.get(), m_OutputGateBiasTensor.get(), + &output_state_in, &cell_state_in, m_ScratchBuffer.get(), &output_state_out, + &cell_state_out, &output, lstm_param, activationLayerInfo, + cell_threshold, projection_threshold); + + armcomputetensorutils::InitialiseArmComputeTensorEmpty(*m_ScratchBuffer); + + InitialiseArmComputeClTensorData(*m_InputToForgetWeightsTensor, + m_Data.m_InputToForgetWeights->GetConstTensor()); + InitialiseArmComputeClTensorData(*m_InputToCellWeightsTensor, + m_Data.m_InputToCellWeights->GetConstTensor()); + InitialiseArmComputeClTensorData(*m_InputToOutputWeightsTensor, + m_Data.m_InputToOutputWeights->GetConstTensor()); + InitialiseArmComputeClTensorData(*m_RecurrentToForgetWeightsTensor, + m_Data.m_RecurrentToForgetWeights->GetConstTensor()); + InitialiseArmComputeClTensorData(*m_RecurrentToCellWeightsTensor, + m_Data.m_RecurrentToCellWeights->GetConstTensor()); + InitialiseArmComputeClTensorData(*m_RecurrentToOutputWeightsTensor, + m_Data.m_RecurrentToOutputWeights->GetConstTensor()); + InitialiseArmComputeClTensorData(*m_ForgetGateBiasTensor, + m_Data.m_ForgetGateBias->GetConstTensor()); + InitialiseArmComputeClTensorData(*m_CellBiasTensor, + m_Data.m_CellBias->GetConstTensor()); + InitialiseArmComputeClTensorData(*m_OutputGateBiasTensor, + m_Data.m_OutputGateBias->GetConstTensor()); + + if (!m_Data.m_Parameters.m_CifgEnabled) + { + InitialiseArmComputeClTensorData(*m_InputToInputWeightsTensor, + m_Data.m_InputToInputWeights->GetConstTensor()); + InitialiseArmComputeClTensorData(*m_RecurrentToInputWeightsTensor, + m_Data.m_RecurrentToInputWeights->GetConstTensor()); + if (m_Data.m_CellToInputWeights != nullptr) + { + InitialiseArmComputeClTensorData(*m_CellToInputWeightsTensor, + m_Data.m_CellToInputWeights->GetConstTensor()); + } + InitialiseArmComputeClTensorData(*m_InputGateBiasTensor, + m_Data.m_InputGateBias->GetConstTensor()); + } + + if (m_Data.m_Parameters.m_ProjectionEnabled) + { + InitialiseArmComputeClTensorData(*m_ProjectionWeightsTensor, + m_Data.m_ProjectionWeights->GetConstTensor()); + if (m_Data.m_ProjectionBias != nullptr) + { + InitialiseArmComputeClTensorData(*m_ProjectionBiasTensor, + m_Data.m_ProjectionBias->GetConstTensor()); + } + } + + if (m_Data.m_Parameters.m_PeepholeEnabled) + { + InitialiseArmComputeClTensorData(*m_CellToForgetWeightsTensor, + m_Data.m_CellToForgetWeights->GetConstTensor()); + InitialiseArmComputeClTensorData(*m_CellToOutputWeightsTensor, + m_Data.m_CellToOutputWeights->GetConstTensor()); + } + + // Force Compute Library to perform the necessary copying and reshaping, after which + // delete all the input tensors that will no longer be needed + m_LstmLayer.prepare(); + FreeUnusedTensors(); +} + +void ClLstmFloat32Workload::Execute() const +{ + m_LstmLayer.run(); +} + +arm_compute::Status ClLstmFloat32WorkloadValidate(const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor& descriptor, + const TensorInfo& inputToForgetWeights, + const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, + const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, + const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, + const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, + const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, + const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, + const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights) +{ + arm_compute::LSTMParams lstm_params_info; + + // The inputs and the outputs + const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputStateInInfo = BuildArmComputeTensorInfo(outputStateIn); + const arm_compute::TensorInfo aclCellStateInInfo = BuildArmComputeTensorInfo(cellStateIn); + const arm_compute::TensorInfo aclScratchBufferInfo = BuildArmComputeTensorInfo(scratchBuffer); + const arm_compute::TensorInfo aclOutputStateOutInfo = BuildArmComputeTensorInfo(outputStateOut); + const arm_compute::TensorInfo aclCellStateOutInfo = BuildArmComputeTensorInfo(cellStateOut); + const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); + + // Basic parameters + const arm_compute::TensorInfo aclInputToForgetWeightsInfo = BuildArmComputeTensorInfo(inputToForgetWeights); + const arm_compute::TensorInfo aclInputToCellWeightsInfo = BuildArmComputeTensorInfo(inputToCellWeights); + const arm_compute::TensorInfo aclInputToOutputWeightsInfo = BuildArmComputeTensorInfo(inputToOutputWeights); + const arm_compute::TensorInfo aclRecurrentToForgetWeightsInfo + = BuildArmComputeTensorInfo(recurrentToForgetWeights); + const arm_compute::TensorInfo aclRecurrentToCellWeightsInfo + = BuildArmComputeTensorInfo(recurrentToCellWeights); + const arm_compute::TensorInfo aclRecurrentToOutputWeightsInfo + = BuildArmComputeTensorInfo(recurrentToOutputWeights); + const arm_compute::TensorInfo aclForgetGateBiasInfo = BuildArmComputeTensorInfo(forgetGateBias); + const arm_compute::TensorInfo aclCellBiasInfo = BuildArmComputeTensorInfo(cellBias); + const arm_compute::TensorInfo aclOutputGateBiasInfo = BuildArmComputeTensorInfo(outputGateBias); + + arm_compute::TensorInfo aclInputToInputWeightsInfo; + arm_compute::TensorInfo aclRecurrentToInputWeightsInfo; + arm_compute::TensorInfo aclCellToInputWeightsInfo; + arm_compute::TensorInfo aclInputGateBiasInfo; + arm_compute::TensorInfo aclProjectionWeightsInfo; + arm_compute::TensorInfo aclProjectionBiasInfo; + arm_compute::TensorInfo aclCellToForgetWeightsInfo; + arm_compute::TensorInfo aclCellToOutputWeightsInfo; + + if (!descriptor.m_CifgEnabled) + { + armnn::TensorInfo inputToInputWInfo = *inputToInputWeights; + aclInputToInputWeightsInfo = BuildArmComputeTensorInfo(inputToInputWInfo); + armnn::TensorInfo recurrentToInputWInfo = *recurrentToInputWeights; + aclRecurrentToInputWeightsInfo = BuildArmComputeTensorInfo(recurrentToInputWInfo); + + if (cellToInputWeights != nullptr) + { + armnn::TensorInfo cellToInputWInfo = *cellToInputWeights; + aclCellToInputWeightsInfo = BuildArmComputeTensorInfo(cellToInputWInfo); + } + armnn::TensorInfo inputGateBiasInfo = *inputGateBias; + aclInputGateBiasInfo = BuildArmComputeTensorInfo(inputGateBiasInfo); + lstm_params_info.set_cifg_params(&aclInputToInputWeightsInfo, &aclRecurrentToInputWeightsInfo, + cellToInputWeights != nullptr ? &aclCellToInputWeightsInfo: nullptr, + &aclInputGateBiasInfo); + } + + if (descriptor.m_ProjectionEnabled) + { + const armnn::TensorInfo& projectionWInfo = *projectionWeights; + aclProjectionWeightsInfo = BuildArmComputeTensorInfo(projectionWInfo); + + if (projectionBias != nullptr) + { + const armnn::TensorInfo& projectionBiasInfo = *projectionBias; + aclProjectionBiasInfo = BuildArmComputeTensorInfo(projectionBiasInfo); + } + lstm_params_info.set_projection_params(&aclProjectionWeightsInfo, + projectionBias != nullptr ? &aclProjectionBiasInfo: nullptr); + } + + if (descriptor.m_PeepholeEnabled) + { + const armnn::TensorInfo& cellToForgetWInfo = *cellToForgetWeights; + aclCellToForgetWeightsInfo = BuildArmComputeTensorInfo(cellToForgetWInfo); + const armnn::TensorInfo& cellToOutputWInfo = *cellToOutputWeights; + aclCellToOutputWeightsInfo = BuildArmComputeTensorInfo(cellToOutputWInfo); + lstm_params_info.set_peephole_params(&aclCellToForgetWeightsInfo, &aclCellToOutputWeightsInfo); + } + + float cell_threshold = descriptor.m_ClippingThresCell; + float projection_threshold = descriptor.m_ClippingThresProj; + + // for preparing the object for the class ActivationLayerInfo, we need to consider 5 situations + arm_compute::ActivationLayerInfo activationLayerInfo; + if (descriptor.m_ActivationFunc == 0) + { + // no activation, do nothing + } + else if (descriptor.m_ActivationFunc == 1) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::RELU); + } + else if (descriptor.m_ActivationFunc == 3) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0); + } + else if (descriptor.m_ActivationFunc == 4) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0, 1.0); + } + else if (descriptor.m_ActivationFunc == 6) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC); + } + else + { + throw armnn::Exception("Wrong Type of Activation Function!"); + } + + return arm_compute::CLLSTMLayer::validate(&aclInputInfo, &aclInputToForgetWeightsInfo, + &aclInputToCellWeightsInfo, + &aclInputToOutputWeightsInfo, + &aclRecurrentToForgetWeightsInfo, + &aclRecurrentToCellWeightsInfo, + &aclRecurrentToOutputWeightsInfo, + &aclForgetGateBiasInfo, + &aclCellBiasInfo, + &aclOutputGateBiasInfo, + &aclOutputStateInInfo, &aclCellStateInInfo, + &aclScratchBufferInfo, &aclOutputStateOutInfo, + &aclCellStateOutInfo, &aclOutputInfo, + lstm_params_info, activationLayerInfo, + cell_threshold, projection_threshold); +} + +void ClLstmFloat32Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_InputToInputWeightsTensor); + FreeTensorIfUnused(m_InputToForgetWeightsTensor); + FreeTensorIfUnused(m_InputToCellWeightsTensor); + FreeTensorIfUnused(m_InputToOutputWeightsTensor); + FreeTensorIfUnused(m_RecurrentToInputWeightsTensor); + FreeTensorIfUnused(m_RecurrentToForgetWeightsTensor); + FreeTensorIfUnused(m_RecurrentToCellWeightsTensor); + FreeTensorIfUnused(m_RecurrentToOutputWeightsTensor); + FreeTensorIfUnused(m_CellToInputWeightsTensor); + FreeTensorIfUnused(m_CellToForgetWeightsTensor); + FreeTensorIfUnused(m_CellToOutputWeightsTensor); + FreeTensorIfUnused(m_InputGateBiasTensor); + FreeTensorIfUnused(m_ForgetGateBiasTensor); + FreeTensorIfUnused(m_CellBiasTensor); + FreeTensorIfUnused(m_OutputGateBiasTensor); + FreeTensorIfUnused(m_ProjectionWeightsTensor); + FreeTensorIfUnused(m_ProjectionBiasTensor); + FreeTensorIfUnused(m_ScratchBuffer); +} + +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.hpp new file mode 100644 index 0000000000..e2358ad10d --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.hpp @@ -0,0 +1,67 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/ClWorkloadUtils.hpp" +#include "backends/Workload.hpp" +#include "backends/WorkloadData.hpp" + +namespace armnn +{ + +class ClLstmFloat32Workload : public FloatWorkload +{ +public: + ClLstmFloat32Workload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info); + void Execute() const override; + +private: + mutable arm_compute::CLLSTMLayer m_LstmLayer; + + std::unique_ptr m_InputToInputWeightsTensor; + std::unique_ptr m_InputToForgetWeightsTensor; + std::unique_ptr m_InputToCellWeightsTensor; + std::unique_ptr m_InputToOutputWeightsTensor; + std::unique_ptr m_RecurrentToInputWeightsTensor; + std::unique_ptr m_RecurrentToForgetWeightsTensor; + std::unique_ptr m_RecurrentToCellWeightsTensor; + std::unique_ptr m_RecurrentToOutputWeightsTensor; + std::unique_ptr m_CellToInputWeightsTensor; + std::unique_ptr m_CellToForgetWeightsTensor; + std::unique_ptr m_CellToOutputWeightsTensor; + std::unique_ptr m_InputGateBiasTensor; + std::unique_ptr m_ForgetGateBiasTensor; + std::unique_ptr m_CellBiasTensor; + std::unique_ptr m_OutputGateBiasTensor; + std::unique_ptr m_ProjectionWeightsTensor; + std::unique_ptr m_ProjectionBiasTensor; + + std::unique_ptr m_ScratchBuffer; + + void FreeUnusedTensors(); +}; + +arm_compute::Status ClLstmFloat32WorkloadValidate(const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor &descriptor, + const TensorInfo& inputToForgetWeights, + const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, + const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, + const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, + const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, + const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, + const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, + const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights); +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp index 4d2d708a0e..89e7690a36 100644 --- a/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp @@ -11,7 +11,7 @@ namespace armnn void ClMergerFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClMergerFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClMergerFloat32Workload_Execute"); ClBaseMergerWorkload::Execute(); } diff --git a/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.hpp index 9808d30ccf..3cafa23c1e 100644 --- a/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.hpp @@ -10,10 +10,10 @@ namespace armnn { -class ClMergerFloat32Workload : public ClBaseMergerWorkload +class ClMergerFloat32Workload : public ClBaseMergerWorkload { public: - using ClBaseMergerWorkload::ClBaseMergerWorkload; + using ClBaseMergerWorkload::ClBaseMergerWorkload; virtual void Execute() const override; }; diff --git a/src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp index 94a1d3c593..551135b7da 100644 --- a/src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp @@ -11,7 +11,7 @@ namespace armnn void ClMergerUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClMergerUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClMergerUint8Workload_Execute"); ClBaseMergerWorkload::Execute(); } diff --git a/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.cpp index 405d109aa1..7aa33146f3 100644 --- a/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.cpp @@ -10,9 +10,29 @@ namespace armnn { +arm_compute::Status ClMultiplicationWorkloadValidate(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output) +{ + const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input0); + const arm_compute::TensorInfo aclInput2 = armcomputetensorutils::BuildArmComputeTensorInfo(input1); + const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + // At the time of writing, configure() will fail if a rounding policy other than TO_ZERO is supplied to it, + // when providing a scale of 1.0 for F32 tensors, even though the provided rounding policy appears to be + // ignored for F32 tensors. + return arm_compute::CLPixelWiseMultiplication::validate(&aclInput1, + &aclInput2, + &aclOutput, + 1.0f, + arm_compute::ConvertPolicy::SATURATE, + arm_compute::RoundingPolicy::TO_ZERO); +} + + ClMultiplicationFloat32Workload::ClMultiplicationFloat32Workload(const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) { m_Data.ValidateInputsOutputs("ClMultiplicationFloat32Workload", 2, 1); @@ -30,9 +50,9 @@ ClMultiplicationFloat32Workload::ClMultiplicationFloat32Workload(const Multiplic void ClMultiplicationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClMultiplicationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClMultiplicationFloat32Workload_Execute"); - // Execute the layer + // Executes the layer. m_PixelWiseMultiplication.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp index 8e387118e8..0d6199047d 100644 --- a/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp @@ -9,12 +9,17 @@ namespace armnn { -class ClMultiplicationFloat32Workload : public Float32Workload + +arm_compute::Status ClMultiplicationWorkloadValidate(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output); + +class ClMultiplicationFloat32Workload : public FloatWorkload { public: ClMultiplicationFloat32Workload(const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info); - using Float32Workload::Float32Workload; + using FloatWorkload::FloatWorkload; void Execute() const override; private: diff --git a/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.cpp index a163ec2883..d23d6e11bd 100644 --- a/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.cpp @@ -27,7 +27,7 @@ arm_compute::Status ClNormalizationWorkloadValidate(const TensorInfo& input, con ClNormalizationFloat32Workload::ClNormalizationFloat32Workload(const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) { m_Data.ValidateInputsOutputs("ClNormalizationFloat32Workload", 1, 1); @@ -42,7 +42,7 @@ ClNormalizationFloat32Workload::ClNormalizationFloat32Workload(const Normalizati void ClNormalizationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClNormalizationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClNormalizationFloat32Workload_Execute"); m_NormalizationLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.hpp index cbd5fa92a9..e8ab0b9a18 100644 --- a/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.hpp @@ -14,7 +14,7 @@ arm_compute::Status ClNormalizationWorkloadValidate(const TensorInfo& input, const TensorInfo& output, const NormalizationDescriptor& descriptor); -class ClNormalizationFloat32Workload : public Float32Workload +class ClNormalizationFloat32Workload : public FloatWorkload { public: ClNormalizationFloat32Workload(const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClPermuteWorkload.cpp b/src/armnn/backends/ClWorkloads/ClPermuteWorkload.cpp index 3147e95b2e..3c132cb8f8 100644 --- a/src/armnn/backends/ClWorkloads/ClPermuteWorkload.cpp +++ b/src/armnn/backends/ClWorkloads/ClPermuteWorkload.cpp @@ -24,10 +24,10 @@ arm_compute::Status ClPermuteWorkloadValidate(const PermuteDescriptor& descripto return arm_compute::Status{}; } -template -ClPermuteWorkload::ClPermuteWorkload(const PermuteQueueDescriptor& descriptor, +template +ClPermuteWorkload::ClPermuteWorkload(const PermuteQueueDescriptor& descriptor, const WorkloadInfo& info) - : TypedWorkload(descriptor, info) + : TypedWorkload(descriptor, info) { using armcomputetensorutils::BuildArmComputePermutationVector; @@ -37,18 +37,18 @@ ClPermuteWorkload::ClPermuteWorkload(const PermuteQueueDescriptor& des arm_compute::ICLTensor& output = static_cast(m_Data.m_Outputs[0])->GetTensor(); const armnn::PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings; - // Run the layer + // Run the layer. m_PermuteFunction.configure(&input, &output, BuildArmComputePermutationVector(mappings)); } -template -void ClPermuteWorkload::Execute() const +template +void ClPermuteWorkload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, GetName() + "_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL( GetName() + "_Execute"); m_PermuteFunction.run(); } -template class ClPermuteWorkload; +template class ClPermuteWorkload; template class ClPermuteWorkload; } // namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClPermuteWorkload.hpp b/src/armnn/backends/ClWorkloads/ClPermuteWorkload.hpp index 430c59524e..c8726bc2c6 100644 --- a/src/armnn/backends/ClWorkloads/ClPermuteWorkload.hpp +++ b/src/armnn/backends/ClWorkloads/ClPermuteWorkload.hpp @@ -7,6 +7,7 @@ #include "backends/Workload.hpp" #include "backends/WorkloadData.hpp" +#include "backends/ClWorkloadUtils.hpp" #include #include @@ -18,13 +19,13 @@ namespace armnn arm_compute::Status ClPermuteWorkloadValidate(const PermuteDescriptor& descriptor); -template -class ClPermuteWorkload : public TypedWorkload +template +class ClPermuteWorkload : public TypedWorkload { public: static const std::string& GetName() { - static const std::string name = std::string("ClPermute") + GetDataTypeName(DataType) + "Workload"; + static const std::string name = std::string("ClPermuteWorkload"); return name; } @@ -32,11 +33,11 @@ public: void Execute() const override; private: - using TypedWorkload::m_Data; + using TypedWorkload::m_Data; mutable arm_compute::CLPermute m_PermuteFunction; }; -using ClPermuteFloat32Workload = ClPermuteWorkload; +using ClPermuteFloatWorkload = ClPermuteWorkload; using ClPermuteUint8Workload = ClPermuteWorkload; -} //namespace armnn +} // namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.cpp b/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.cpp index dbdc06f174..6b8a230912 100644 --- a/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.cpp +++ b/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.cpp @@ -25,10 +25,10 @@ arm_compute::Status ClPooling2dWorkloadValidate(const TensorInfo& input, return arm_compute::CLPoolingLayer::validate(&aclInputInfo, &aclOutputInfo, layerInfo); } -template -ClPooling2dBaseWorkload::ClPooling2dBaseWorkload( +template +ClPooling2dBaseWorkload::ClPooling2dBaseWorkload( const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info, const std::string& name) - : TypedWorkload(descriptor, info) + : TypedWorkload(descriptor, info) { m_Data.ValidateInputsOutputs(name, 1, 1); @@ -37,11 +37,11 @@ ClPooling2dBaseWorkload::ClPooling2dBaseWorkload( arm_compute::PoolingLayerInfo layerInfo = BuildArmComputePoolingLayerInfo(m_Data.m_Parameters); - // Run the layer + // Run the layer. m_PoolingLayer.configure(&input, &output, layerInfo); } -template class ClPooling2dBaseWorkload; +template class ClPooling2dBaseWorkload; template class ClPooling2dBaseWorkload; } diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.hpp b/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.hpp index 828f000505..aea32c9e86 100644 --- a/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.hpp +++ b/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.hpp @@ -14,12 +14,12 @@ arm_compute::Status ClPooling2dWorkloadValidate(const TensorInfo& input, const TensorInfo& output, const Pooling2dDescriptor& descriptor); -// Base class template providing an implementation of the Pooling2d layer common to all data types -template -class ClPooling2dBaseWorkload : public TypedWorkload +// Base class template providing an implementation of the Pooling2d layer common to all data types. +template +class ClPooling2dBaseWorkload : public TypedWorkload { public: - using TypedWorkload::m_Data; + using TypedWorkload::m_Data; ClPooling2dBaseWorkload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info, const std::string& name); diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.cpp index a7f5855b8a..3a5b8ca526 100644 --- a/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.cpp @@ -10,13 +10,13 @@ namespace armnn ClPooling2dFloat32Workload::ClPooling2dFloat32Workload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info) - : ClPooling2dBaseWorkload(descriptor, info, "ClPooling2dFloat32Workload") + : ClPooling2dBaseWorkload(descriptor, info, "ClPooling2dFloat32Workload") { } void ClPooling2dFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClPooling2dFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClPooling2dFloat32Workload_Execute"); m_PoolingLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.hpp index 3456a2cff8..ad189bdb52 100644 --- a/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -class ClPooling2dFloat32Workload : public ClPooling2dBaseWorkload +class ClPooling2dFloat32Workload : public ClPooling2dBaseWorkload { public: ClPooling2dFloat32Workload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClPooling2dUint8Workload.cpp index 2d2109e252..94cf753f5a 100644 --- a/src/armnn/backends/ClWorkloads/ClPooling2dUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClPooling2dUint8Workload.cpp @@ -16,7 +16,7 @@ ClPooling2dUint8Workload::ClPooling2dUint8Workload(const Pooling2dQueueDescripto void ClPooling2dUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClPooling2dUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClPooling2dUint8Workload_Execute"); m_PoolingLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.cpp index 7b4ad4415b..05fba222ac 100644 --- a/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.cpp @@ -11,7 +11,7 @@ namespace armnn { ClReshapeFloat32Workload::ClReshapeFloat32Workload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) { m_Data.ValidateInputsOutputs("ClReshapeFloat32Workload", 1, 1); @@ -23,7 +23,7 @@ ClReshapeFloat32Workload::ClReshapeFloat32Workload(const ReshapeQueueDescriptor& void ClReshapeFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClReshapeFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClReshapeFloat32Workload_Execute"); m_Layer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.hpp index e344ee08ad..0eb4d08da0 100644 --- a/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -class ClReshapeFloat32Workload : public Float32Workload +class ClReshapeFloat32Workload : public FloatWorkload { public: ClReshapeFloat32Workload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.cpp index 36cc1dec17..050fb9aa33 100644 --- a/src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.cpp @@ -21,7 +21,7 @@ ClReshapeUint8Workload::ClReshapeUint8Workload(const ReshapeQueueDescriptor& des void ClReshapeUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClReshapeUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClReshapeUint8Workload_Execute"); m_Layer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp index d71011a2e3..abef682611 100644 --- a/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp @@ -14,7 +14,7 @@ namespace armnn ClResizeBilinearFloat32Workload::ClResizeBilinearFloat32Workload(const ResizeBilinearQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) { m_Data.ValidateInputsOutputs("ClResizeBilinearFloat32Workload", 1, 1); @@ -28,7 +28,7 @@ ClResizeBilinearFloat32Workload::ClResizeBilinearFloat32Workload(const ResizeBil void ClResizeBilinearFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClResizeBilinearFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClResizeBilinearFloat32Workload_Execute"); m_ResizeBilinearLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.hpp index 5f70e71619..81c0566bb3 100644 --- a/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -class ClResizeBilinearFloat32Workload : public Float32Workload +class ClResizeBilinearFloat32Workload : public FloatWorkload { public: ClResizeBilinearFloat32Workload(const ResizeBilinearQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.cpp b/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.cpp new file mode 100644 index 0000000000..cd3107cfe1 --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.cpp @@ -0,0 +1,28 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "ClSoftmaxBaseWorkload.hpp" + +#include "backends/ArmComputeTensorUtils.hpp" + +namespace armnn +{ + +arm_compute::Status ClSoftmaxWorkloadValidate(const TensorInfo& input, + const TensorInfo& output) +{ + // NOTE: We report 4D Softmax as unsupported until full support is added to ACL + if(input.GetShape().GetNumDimensions() >= 4u) + { + return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, "4d softmax is not supported"); + } + + const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + return arm_compute::CLSoftmaxLayer::validate(&aclInputInfo, &aclOutputInfo); +} + +} diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.hpp b/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.hpp new file mode 100644 index 0000000000..e0113134af --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.hpp @@ -0,0 +1,16 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/ClWorkloadUtils.hpp" + +namespace armnn +{ + +arm_compute::Status ClSoftmaxWorkloadValidate(const TensorInfo& input, + const TensorInfo& output); + +} // namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp index 1d05172b42..08247bc593 100644 --- a/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp @@ -12,7 +12,7 @@ namespace armnn ClSoftmaxFloat32Workload::ClSoftmaxFloat32Workload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info, std::shared_ptr& memoryManager) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) , m_SoftmaxLayer(memoryManager) { m_Data.ValidateInputsOutputs("ClSoftmaxFloat32Workload", 1, 1); @@ -24,7 +24,7 @@ ClSoftmaxFloat32Workload::ClSoftmaxFloat32Workload(const SoftmaxQueueDescriptor& void ClSoftmaxFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClSoftmaxFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClSoftmaxFloat32Workload_Execute"); m_SoftmaxLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp index cf5c45ac6f..6cad59800b 100644 --- a/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp @@ -14,7 +14,7 @@ namespace armnn { -class ClSoftmaxFloat32Workload : public Float32Workload +class ClSoftmaxFloat32Workload : public FloatWorkload { public: ClSoftmaxFloat32Workload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info, diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp index ee9ab4754b..3cd9a6a5ec 100644 --- a/src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp @@ -33,7 +33,7 @@ ClSoftmaxUint8Workload::ClSoftmaxUint8Workload(const SoftmaxQueueDescriptor& des void ClSoftmaxUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClSoftmaxUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClSoftmaxUint8Workload_Execute"); m_SoftmaxLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.cpp index 6221d56766..8a622c6caf 100644 --- a/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.cpp @@ -10,7 +10,7 @@ namespace armnn void ClSplitterFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClSplitterFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClSplitterFloat32Workload_Execute"); ClBaseSplitterWorkload::Execute(); } diff --git a/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.hpp index cfc7eaa3c2..affa9f840f 100644 --- a/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.hpp @@ -10,10 +10,10 @@ namespace armnn { -class ClSplitterFloat32Workload : public ClBaseSplitterWorkload +class ClSplitterFloat32Workload : public ClBaseSplitterWorkload { public: - using ClBaseSplitterWorkload::ClBaseSplitterWorkload; + using ClBaseSplitterWorkload::ClBaseSplitterWorkload; virtual void Execute() const override; }; diff --git a/src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.cpp index 3aa470894c..d2d25495e0 100644 --- a/src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.cpp @@ -10,7 +10,7 @@ namespace armnn void ClSplitterUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClSplitterUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClSplitterUint8Workload_Execute"); ClBaseSplitterWorkload::Execute(); } diff --git a/src/armnn/backends/CpuTensorHandle.cpp b/src/armnn/backends/CpuTensorHandle.cpp index dd8176c9ec..78cf6efd2e 100644 --- a/src/armnn/backends/CpuTensorHandle.cpp +++ b/src/armnn/backends/CpuTensorHandle.cpp @@ -45,6 +45,12 @@ ScopedCpuTensorHandle::ScopedCpuTensorHandle(const ConstTensor& tensor) CopyFrom(tensor.GetMemoryArea(), tensor.GetNumBytes()); } +ScopedCpuTensorHandle::ScopedCpuTensorHandle(const ConstCpuTensorHandle& tensorHandle) +: ScopedCpuTensorHandle(tensorHandle.GetTensorInfo()) +{ + CopyFrom(tensorHandle.GetConstTensor(), tensorHandle.GetTensorInfo().GetNumBytes()); +} + ScopedCpuTensorHandle::ScopedCpuTensorHandle(const ScopedCpuTensorHandle& other) : CpuTensorHandle(other.GetTensorInfo()) { diff --git a/src/armnn/backends/CpuTensorHandle.hpp b/src/armnn/backends/CpuTensorHandle.hpp index 4bf4439083..3376650ec3 100644 --- a/src/armnn/backends/CpuTensorHandle.hpp +++ b/src/armnn/backends/CpuTensorHandle.hpp @@ -9,10 +9,12 @@ #include "OutputHandler.hpp" +#include + namespace armnn { -// Abstract tensor handle wrapping a CPU-readable region of memory, interpreting it as tensor data. +// Abstract tensor handles wrapping a CPU-readable region of memory, interpreting it as tensor data. class ConstCpuTensorHandle : public ITensorHandle { public: @@ -33,6 +35,30 @@ public: return ITensorHandle::Cpu; } + virtual void Manage() override {} + + virtual ITensorHandle* GetParent() const override { return nullptr; } + + virtual const void* Map(bool /* blocking = true */) const override { return m_Memory; } + virtual void Unmap() const override {} + + TensorShape GetStrides() const override + { + TensorShape shape(m_TensorInfo.GetShape()); + auto size = GetDataTypeSize(m_TensorInfo.GetDataType()); + auto runningSize = size; + std::vector strides(shape.GetNumDimensions()); + auto lastIdx = shape.GetNumDimensions()-1; + for (unsigned int i=0; i < lastIdx ; i++) + { + strides[lastIdx-i] = runningSize; + runningSize *= shape[lastIdx-i]; + } + strides[0] = runningSize; + return TensorShape(shape.GetNumDimensions(), strides.data()); + } + TensorShape GetShape() const override { return m_TensorInfo.GetShape(); } + protected: ConstCpuTensorHandle(const TensorInfo& tensorInfo); @@ -46,7 +72,7 @@ private: const void* m_Memory; }; -// Abstract specialization of ConstCpuTensorHandle that allows write access to the same data +// Abstract specialization of ConstCpuTensorHandle that allows write access to the same data. class CpuTensorHandle : public ConstCpuTensorHandle { public: @@ -79,9 +105,12 @@ class ScopedCpuTensorHandle : public CpuTensorHandle public: explicit ScopedCpuTensorHandle(const TensorInfo& tensorInfo); - // Copies contents from Tensor + // Copies contents from Tensor. explicit ScopedCpuTensorHandle(const ConstTensor& tensor); + // Copies contents from ConstCpuTensorHandle + explicit ScopedCpuTensorHandle(const ConstCpuTensorHandle& tensorHandle); + ScopedCpuTensorHandle(const ScopedCpuTensorHandle& other); ScopedCpuTensorHandle& operator=(const ScopedCpuTensorHandle& other); ~ScopedCpuTensorHandle(); @@ -98,7 +127,7 @@ private: // Clients must make sure the passed in memory region stays alive for the lifetime of // the PassthroughCpuTensorHandle instance. // -// Note there is no polymorphism to/from ConstPassthroughCpuTensorHandle +// Note there is no polymorphism to/from ConstPassthroughCpuTensorHandle. class PassthroughCpuTensorHandle : public CpuTensorHandle { public: @@ -117,7 +146,7 @@ public: // Clients must make sure the passed in memory region stays alive for the lifetime of // the PassthroughCpuTensorHandle instance. // -// Note there is no polymorphism to/from PassthroughCpuTensorHandle +// Note there is no polymorphism to/from PassthroughCpuTensorHandle. class ConstPassthroughCpuTensorHandle : public ConstCpuTensorHandle { public: @@ -131,7 +160,7 @@ public: }; -// template specializations +// Template specializations. template <> const void* ConstCpuTensorHandle::GetConstTensor() const; diff --git a/src/armnn/backends/ITensorHandle.hpp b/src/armnn/backends/ITensorHandle.hpp index b95dcc65e0..ab571ab305 100644 --- a/src/armnn/backends/ITensorHandle.hpp +++ b/src/armnn/backends/ITensorHandle.hpp @@ -7,6 +7,8 @@ namespace armnn { +class TensorShape; + class ITensorHandle { public: @@ -18,8 +20,54 @@ public: }; virtual ~ITensorHandle(){} + + /// Indicate to the memory manager that this resource is active. + /// This is used to compute overlapping lifetimes of resources. + virtual void Manage() = 0; + + /// Indicate to the memory manager that this resource is no longer active. + /// This is used to compute overlapping lifetimes of resources. virtual void Allocate() = 0; + + /// Get the type backend associated with the tensor handle. + /// \return Type enum virtual ITensorHandle::Type GetType() const = 0; + + /// Get the parent tensor if this is a subtensor. + /// \return a pointer to the parent tensor. Otherwise nullptr if not a subtensor. + virtual ITensorHandle* GetParent() const = 0; + + /// Map the tensor data for access. + /// \param blocking hint to block the calling thread until all other accesses are complete. (backend dependent) + /// \return pointer to the first element of the mapped data. + virtual const void* Map(bool blocking=true) const = 0; + + /// Unmap the tensor data + virtual void Unmap() const = 0; + + /// Map the tensor data for access. Must be paired with call to Unmap(). + /// \param blocking hint to block the calling thread until all other accesses are complete. (backend dependent) + /// \return pointer to the first element of the mapped data. + void* Map(bool blocking=true) + { + return const_cast(static_cast(this)->Map(blocking)); + } + + /// Unmap the tensor data that was previously mapped with call to Map(). + void Unmap() + { + return static_cast(this)->Unmap(); + } + + /// Get the strides for each dimension ordered from largest to smallest where + /// the smallest value is the same as the size of a single element in the tensor. + /// \return a TensorShape filled with the strides for each dimension + virtual TensorShape GetStrides() const = 0; + + /// Get the number of elements for each dimension orderd from slowest iterating dimension + /// to fastest iterating dimension. + /// \return a TensorShape filled with the number of elements for each dimension. + virtual TensorShape GetShape() const = 0; }; } diff --git a/src/armnn/backends/MakeWorkloadHelper.hpp b/src/armnn/backends/MakeWorkloadHelper.hpp index a1f9b0b0eb..64a7f8983b 100644 --- a/src/armnn/backends/MakeWorkloadHelper.hpp +++ b/src/armnn/backends/MakeWorkloadHelper.hpp @@ -9,7 +9,7 @@ namespace armnn namespace { -// Make a workload of the specified WorkloadType +// Make a workload of the specified WorkloadType. template struct MakeWorkloadForType { @@ -37,7 +37,8 @@ struct MakeWorkloadForType // Makes a workload for one the specified types based on the data type requirements of the tensorinfo. // Specify type void as the WorkloadType for unsupported DataType/WorkloadType combos. -template +template std::unique_ptr MakeWorkload(const QueueDescriptorType& descriptor, const WorkloadInfo& info, Args&&... args) { const DataType dataType = !info.m_InputTensorInfos.empty() ? @@ -49,6 +50,8 @@ std::unique_ptr MakeWorkload(const QueueDescriptorType& descriptor, c switch (dataType) { + case DataType::Float16: + return MakeWorkloadForType::Func(descriptor, info, std::forward(args)...); case DataType::Float32: return MakeWorkloadForType::Func(descriptor, info, std::forward(args)...); case DataType::QuantisedAsymm8: @@ -59,5 +62,17 @@ std::unique_ptr MakeWorkload(const QueueDescriptorType& descriptor, c } } +// Makes a workload for one the specified types based on the data type requirements of the tensorinfo. +// Calling this method is the equivalent of calling the three typed MakeWorkload method with . +// Specify type void as the WorkloadType for unsupported DataType/WorkloadType combos. +template +std::unique_ptr MakeWorkload(const QueueDescriptorType& descriptor, const WorkloadInfo& info, Args&&... args) +{ + return MakeWorkload(descriptor, info, + std::forward(args)...); +} + + } //namespace } //namespace armnn diff --git a/src/armnn/backends/MemCopyWorkload.cpp b/src/armnn/backends/MemCopyWorkload.cpp index 09ffd9a08a..27e60f93b7 100644 --- a/src/armnn/backends/MemCopyWorkload.cpp +++ b/src/armnn/backends/MemCopyWorkload.cpp @@ -4,14 +4,7 @@ // #include "MemCopyWorkload.hpp" #include "backends/CpuTensorHandle.hpp" - -#if ARMCOMPUTECL_ENABLED -#include "backends/ClTensorHandle.hpp" -#endif - -#if ARMCOMPUTENEON_ENABLED -#include "backends/NeonTensorHandle.hpp" -#endif +#include "TypeUtils.hpp" #include #include @@ -26,7 +19,7 @@ template void GatherTensorHandlePairs(const MemCopyQueueDescriptor& descriptor, std::vector>& tensorHandlePairs) { - const unsigned int numInputs = boost::numeric_cast(descriptor.m_Inputs.size()); + const unsigned int numInputs = static_cast(descriptor.m_Inputs.size()); tensorHandlePairs.reserve(numInputs); for (unsigned int i = 0; i < numInputs; ++i) @@ -40,217 +33,29 @@ void GatherTensorHandlePairs(const MemCopyQueueDescriptor& descriptor, } } -void CopyFromCpuToCpu(const ConstCpuTensorHandle& srcHandle, CpuTensorHandle& dstHandle) -{ - const unsigned int numBytes = srcHandle.GetTensorInfo().GetNumBytes(); - const void* const input = srcHandle.GetConstTensor(); - void* const output = dstHandle.GetTensor(); - std::memcpy(output, input, numBytes); -} - -#if ARMCOMPUTECL_ENABLED || ARMCOMPUTENEON_ENABLED - -#include "backends/ArmComputeTensorUtils.hpp" - -template -void CopyFromCpuToAclBackend(const ConstCpuTensorHandle& srcHandle, arm_compute::ITensor& dstAclTensor) -{ - using T = ResolveType; - armnn::armcomputetensorutils::CopyArmComputeITensorData(srcHandle.GetConstTensor(), dstAclTensor); -} - -template -void CopyFromAclBackendToCpu(const arm_compute::ITensor& srcAclTensor, CpuTensorHandle& dstHandle) -{ - using T = ResolveType; - armnn::armcomputetensorutils::CopyArmComputeITensorData(srcAclTensor, dstHandle.GetTensor()); -} - -#endif // ARMCOMPUTECL_ENABLED || ARMCOMPUTENEON_ENABLED - -} - -template -CopyFromCpuToCpuWorkload::CopyFromCpuToCpuWorkload(const MemCopyQueueDescriptor& descriptor, - const WorkloadInfo& info) - : TypedWorkload(descriptor, info) -{ - GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); -} - -template -void CopyFromCpuToCpuWorkload::Execute() const -{ - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "CopyFromCpuToCpuWorkload_Execute"); - - for (const auto& pair : m_TensorHandlePairs) - { - CopyFromCpuToCpu(*pair.first, *pair.second); - } -} - -template class CopyFromCpuToCpuWorkload; -template class CopyFromCpuToCpuWorkload; - -#if ARMCOMPUTECL_ENABLED - -template -CopyFromCpuToClWorkload::CopyFromCpuToClWorkload(const MemCopyQueueDescriptor& descriptor, - const WorkloadInfo& info) - : TypedWorkload(descriptor, info) -{ - GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); -} - -template -void CopyFromCpuToClWorkload::Execute() const -{ - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "CopyFromCpuToClWorkload_Execute"); - - for (const auto& pair : m_TensorHandlePairs) - { - IClTensorHandle& handle = *pair.second; - - handle.Map(true); - CopyFromCpuToAclBackend(*pair.first, handle.GetTensor()); - handle.UnMap(); - } -} - -template class CopyFromCpuToClWorkload; -template class CopyFromCpuToClWorkload; - - -template -CopyFromClToCpuWorkload::CopyFromClToCpuWorkload(const MemCopyQueueDescriptor& descriptor, - const WorkloadInfo& info) - : TypedWorkload(descriptor, info) -{ - GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); -} - -template -void CopyFromClToCpuWorkload::Execute() const -{ - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "CopyFromClToCpuWorkload_Execute"); - - for (const auto& pair : m_TensorHandlePairs) - { - IClTensorHandle& handle = *pair.first; - - handle.Map(true); - CopyFromAclBackendToCpu(handle.GetTensor(), *pair.second); - handle.UnMap(); - } -} - -template class CopyFromClToCpuWorkload; -template class CopyFromClToCpuWorkload; - -#endif // ARMCOMPUTECL_ENABLED +} //namespace -#if ARMCOMPUTENEON_ENABLED -template -CopyFromCpuToNeonWorkload::CopyFromCpuToNeonWorkload(const MemCopyQueueDescriptor& descriptor, - const WorkloadInfo& info) - : TypedWorkload(descriptor, info) +CopyMemGenericWorkload::CopyMemGenericWorkload(const MemCopyQueueDescriptor& descriptor, + const WorkloadInfo& info) + : BaseWorkload(descriptor, info) { GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); } -template -void CopyFromCpuToNeonWorkload::Execute() const +void CopyMemGenericWorkload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "CopyFromCpuToNeonWorkload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "CopyMemGeneric_Execute"); - for (const auto& pair : m_TensorHandlePairs) - { - CopyFromCpuToAclBackend(*pair.first, pair.second->GetTensor()); - } -} - -template class CopyFromCpuToNeonWorkload; -template class CopyFromCpuToNeonWorkload; - -template -CopyFromNeonToCpuWorkload::CopyFromNeonToCpuWorkload(const MemCopyQueueDescriptor& descriptor, - const WorkloadInfo& info) - : TypedWorkload(descriptor, info) -{ - GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); -} - -template -void CopyFromNeonToCpuWorkload::Execute() const -{ - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "CopyFromNeonToCpuWorkload_Execute"); + auto copyFunc = [](void* dst, const void* src, size_t size) + { + memcpy(dst, src, size); + }; for (const auto& pair : m_TensorHandlePairs) { - CopyFromAclBackendToCpu(pair.first->GetTensor(), *pair.second); + CopyTensorContentsGeneric(pair.first, pair.second, copyFunc); } } -template class CopyFromNeonToCpuWorkload; -template class CopyFromNeonToCpuWorkload; - -#endif // ARMCOMPUTENEON_ENABLED - -#if ARMCOMPUTECL_ENABLED && ARMCOMPUTENEON_ENABLED - -template -CopyFromNeonToClWorkload::CopyFromNeonToClWorkload(const MemCopyQueueDescriptor& descriptor, - const WorkloadInfo& info) - : TypedWorkload(descriptor, info) -{ - GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); -} - -template -void CopyFromNeonToClWorkload::Execute() const -{ - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "CopyFromNeonToClWorkload_Execute"); - - for (const auto& pair : m_TensorHandlePairs) - { - IClTensorHandle& handle = *pair.second; - - handle.Map(true); - handle.GetTensor().copy_from(pair.first->GetTensor()); - handle.UnMap(); - } -} - -template class CopyFromNeonToClWorkload; -template class CopyFromNeonToClWorkload; - -template -CopyFromClToNeonWorkload::CopyFromClToNeonWorkload(const MemCopyQueueDescriptor& descriptor, - const WorkloadInfo& info) - : TypedWorkload(descriptor, info) -{ - GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); -} - -template -void CopyFromClToNeonWorkload::Execute() const -{ - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "CopyFromClToNeonWorkload_Execute"); - - for (const auto& pair : m_TensorHandlePairs) - { - IClTensorHandle& handle = *pair.first; - - handle.Map(true); - pair.second->GetTensor().copy_from(handle.GetTensor()); - handle.UnMap(); - } -} - -template class CopyFromClToNeonWorkload; -template class CopyFromClToNeonWorkload; - -#endif // ARMCOMPUTECL_ENABLED && ARMCOMPUTENEON_ENABLED - -} +} //namespace armnn diff --git a/src/armnn/backends/MemCopyWorkload.hpp b/src/armnn/backends/MemCopyWorkload.hpp index 7fcaf138c3..7a46e5b2ef 100644 --- a/src/armnn/backends/MemCopyWorkload.hpp +++ b/src/armnn/backends/MemCopyWorkload.hpp @@ -6,131 +6,21 @@ #include "CpuTensorHandleFwd.hpp" #include "backends/Workload.hpp" - +#include "WorkloadUtils.hpp" #include namespace armnn { -template -class CopyFromCpuToCpuWorkload : public TypedWorkload -{ -public: - CopyFromCpuToCpuWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info); - void Execute() const override; - -private: - using TensorHandlePair = std::pair; - std::vector m_TensorHandlePairs; -}; - -using CopyFromCpuToCpuFloat32Workload = CopyFromCpuToCpuWorkload; -using CopyFromCpuToCpuUint8Workload = CopyFromCpuToCpuWorkload; - -#if ARMCOMPUTECL_ENABLED - -class IClTensorHandle; - -template -class CopyFromCpuToClWorkload : public TypedWorkload -{ -public: - CopyFromCpuToClWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info); - void Execute() const override; - -private: - using TensorHandlePair = std::pair; - std::vector m_TensorHandlePairs; -}; - -using CopyFromCpuToClFloat32Workload = CopyFromCpuToClWorkload; -using CopyFromCpuToClUint8Workload = CopyFromCpuToClWorkload; - -template -class CopyFromClToCpuWorkload : public TypedWorkload -{ -public: - CopyFromClToCpuWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info); - void Execute() const override; - -private: - using TensorHandlePair = std::pair; - std::vector m_TensorHandlePairs; -}; - -using CopyFromClToCpuFloat32Workload = CopyFromClToCpuWorkload; -using CopyFromClToCpuUint8Workload = CopyFromClToCpuWorkload; - -#endif // ARMCOMPUTECL_ENABLED - -#if ARMCOMPUTENEON_ENABLED - -class INeonTensorHandle; - -template -class CopyFromCpuToNeonWorkload : public TypedWorkload -{ -public: - CopyFromCpuToNeonWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info); - void Execute() const override; - -protected: - using TensorHandlePair = std::pair; - std::vector m_TensorHandlePairs; -}; - -using CopyFromCpuToNeonFloat32Workload = CopyFromCpuToNeonWorkload; -using CopyFromCpuToNeonUint8Workload = CopyFromCpuToNeonWorkload; - -template -class CopyFromNeonToCpuWorkload : public TypedWorkload +class CopyMemGenericWorkload : public BaseWorkload { public: - CopyFromNeonToCpuWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info); - void Execute() const override; - -protected: - using TensorHandlePair = std::pair; - std::vector m_TensorHandlePairs; -}; - -using CopyFromNeonToCpuFloat32Workload = CopyFromNeonToCpuWorkload; -using CopyFromNeonToCpuUint8Workload = CopyFromNeonToCpuWorkload; - -#endif - -#if ARMCOMPUTECL_ENABLED && ARMCOMPUTENEON_ENABLED - -template -class CopyFromNeonToClWorkload : public TypedWorkload -{ -public: - CopyFromNeonToClWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info); + CopyMemGenericWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info); void Execute() const override; private: - using TensorHandlePair = std::pair; + using TensorHandlePair = std::pair; std::vector m_TensorHandlePairs; }; -using CopyFromNeonToClFloat32Workload = CopyFromNeonToClWorkload; -using CopyFromNeonToClUint8Workload = CopyFromNeonToClWorkload; - -template -class CopyFromClToNeonWorkload : public TypedWorkload -{ -public: - CopyFromClToNeonWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info); - void Execute() const override; - -private: - using TensorHandlePair = std::pair; - std::vector m_TensorHandlePairs; -}; - -using CopyFromClToNeonFloat32Workload = CopyFromClToNeonWorkload; -using CopyFromClToNeonUint8Workload = CopyFromClToNeonWorkload; - -#endif - -} +} //namespace armnn diff --git a/src/armnn/backends/NeonLayerSupport.cpp b/src/armnn/backends/NeonLayerSupport.cpp index bfc84bd086..3aef4e60aa 100644 --- a/src/armnn/backends/NeonLayerSupport.cpp +++ b/src/armnn/backends/NeonLayerSupport.cpp @@ -15,34 +15,29 @@ #include #ifdef ARMCOMPUTENEON_ENABLED +#include "NeonWorkloads/NeonAdditionFloat32Workload.hpp" +#include "NeonWorkloads/NeonActivationFloat32Workload.hpp" +#include "NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp" #include "NeonWorkloads/NeonConvolution2dBaseWorkload.hpp" -#include "NeonWorkloads/NeonPooling2dBaseWorkload.hpp" +#include "NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.hpp" +#include "NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp" +#include "NeonWorkloads/NeonMultiplicationFloat32Workload.hpp" +#include "NeonWorkloads/NeonNormalizationFloat32Workload.hpp" +#include "NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp" #include "NeonWorkloads/NeonPermuteWorkload.hpp" +#include "NeonWorkloads/NeonPooling2dBaseWorkload.hpp" +#include "NeonWorkloads/NeonSoftmaxBaseWorkload.hpp" #endif using namespace boost; namespace armnn { -bool IsNeonActivationUint8Supported(std::string* reasonIfUnsupported, const ActivationDescriptor& parameters) -{ - if (parameters.m_Function != ActivationFunction::BoundedReLu) - { - if (reasonIfUnsupported) - { - *reasonIfUnsupported = "Unsupported activation function, only BoundedReLu is supported)"; - } - - return false; - } - - return true; -} bool IsNeonDirectConvolutionPreferred(const TensorInfo& weightInfo, const Convolution2dDescriptor& desc) { // See arm_compute::NEDirectConvolutionLayer documentation for the supported cases, - // and complement with NEDirectConvolutionLayerKernel::configure() implementation + // and complement with NEDirectConvolutionLayerKernel::configure() implementation. // Only 1x1 is using direct convolution. Performance results and details are in: // https://jira.arm.com/browse/IVGCVSW-1003 @@ -60,15 +55,15 @@ bool IsNeonDirectConvolutionPreferred(const TensorInfo& weightInfo, const Convol conv2ddesc.m_PadTop > value || conv2ddesc.m_PadBottom > value; }; - // Supported sizes and padding + // Supported sizes and padding. const bool sizeAndPaddingSupported = - // Pad > 0 not supported for 1x1 weights + // Pad > 0 not supported for 1x1 weights. (weightInfo.GetShape()[2] == 1 && weightInfo.GetShape()[3] == 1 && !paddingLargerThan(desc, 0u)); const bool preferDirectConvolution = dataTypeSupported && strideSupported && sizeAndPaddingSupported && - // NEDirectConvolutionLayerKernel doesn't support NULL bias + // NEDirectConvolutionLayerKernel doesn't support NULL bias. desc.m_BiasEnabled; return preferDirectConvolution; } @@ -108,10 +103,10 @@ bool IsNeonBackendSupported(std::string* reasonIfUnsupported) #endif } -template +template bool IsSupportedForDataTypeNeon(std::string* reasonIfUnsupported, DataType dataType, - Float32Func floatFuncPtr, + FloatFunc floatFuncPtr, Uint8Func uint8FuncPtr, Params&&... params) { @@ -119,6 +114,7 @@ bool IsSupportedForDataTypeNeon(std::string* reasonIfUnsupported, IsSupportedForDataTypeGeneric(reasonIfUnsupported, dataType, floatFuncPtr, + floatFuncPtr, uint8FuncPtr, std::forward(params)...); } @@ -144,43 +140,16 @@ inline bool IsWorkloadSupported(FuncType& func, std::string* reasonIfUnsupported #endif bool IsActivationSupportedNeon(const TensorInfo& input, + const TensorInfo& output, const ActivationDescriptor& descriptor, std::string* reasonIfUnsupported) { ignore_unused(descriptor); - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc, - &IsNeonActivationUint8Supported, - descriptor); -} - -bool IsNeonDepthwiseConvolution2dDescParamsSupported(std::string* reasonIfUnsupported, - const DepthwiseConvolution2dDescriptor& parameters, - const TensorInfo& weights) -{ - ignore_unused(weights); - - if (parameters.m_StrideX < 1 || parameters.m_StrideX > 3) - { - if (reasonIfUnsupported) - { - *reasonIfUnsupported = "m_StrideX can only be 1, 2 or 3"; - } - return false; - } - - // weights.GetShape()[0] = channel multiplier - if (weights.GetShape()[0] != 1) - { - if (reasonIfUnsupported) - { - *reasonIfUnsupported = "Channel multiplier only supports the value 1 in the NEON backend"; - } - return false; - } - - return true; + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonActivationWorkloadValidate, + reasonIfUnsupported, + input, + output, + descriptor); } bool IsAdditionSupportedNeon(const TensorInfo& input0, @@ -188,23 +157,31 @@ bool IsAdditionSupportedNeon(const TensorInfo& input0, const TensorInfo& output, std::string* reasonIfUnsupported) { - ignore_unused(input1); - ignore_unused(output); - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input0.GetDataType(), - &TrueFunc<>, - &FalseFuncU8<>); + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonAdditionWorkloadValidate, + reasonIfUnsupported, + input0, + input1, + output); } bool IsBatchNormalizationSupportedNeon(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, const BatchNormalizationDescriptor& descriptor, std::string* reasonIfUnsupported) { - ignore_unused(descriptor); - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<>, - &FalseFuncU8<>); + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonBatchNormalizationValidate, + reasonIfUnsupported, + input, + output, + mean, + var, + beta, + gamma, + descriptor); } bool IsConstantSupportedNeon(const TensorInfo& output, @@ -233,27 +210,40 @@ bool IsConvolution2dSupportedNeon(const TensorInfo& input, } bool IsDepthwiseConvolutionSupportedNeon(const TensorInfo& input, + const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, + const TensorInfo& biases, std::string* reasonIfUnsupported) { - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input.GetDataType(), - &IsNeonDepthwiseConvolution2dDescParamsSupported, - &IsNeonDepthwiseConvolution2dDescParamsSupported, - descriptor, - weights); + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonDepthwiseConvolutionWorkloadValidate, + reasonIfUnsupported, + input, + output, + descriptor, + weights, + biases); } bool IsFullyConnectedSupportedNeon(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, const FullyConnectedDescriptor& descriptor, std::string* reasonIfUnsupported) { - ignore_unused(descriptor); - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<>, - &FalseFuncU8<>); + // At the moment U8 is unsupported + if (input.GetDataType() == DataType::QuantisedAsymm8) + { + return false; + } + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonFullyConnectedWorkloadValidate, + reasonIfUnsupported, + input, + output, + weights, + biases, + descriptor); } bool IsInputSupportedNeon(const TensorInfo& input, @@ -266,12 +256,10 @@ bool IsInputSupportedNeon(const TensorInfo& input, } bool IsL2NormalizationSupportedNeon(const TensorInfo& input, + const TensorInfo& output, std::string* reasonIfUnsupported) { - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<>, - &FalseFunc<>); + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonL2NormalizationWorkloadValidate, reasonIfUnsupported, input, output); } bool IsMergerSupportedNeon(const std::vector inputs, @@ -287,13 +275,14 @@ bool IsMergerSupportedNeon(const std::vector inputs, bool IsMultiplicationSupportedNeon(const TensorInfo& input0, const TensorInfo& input1, + const TensorInfo& output, std::string* reasonIfUnsupported) { - ignore_unused(input1); - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input0.GetDataType(), - &TrueFunc<>, - &FalseFuncU8<>); + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonMultiplicationWorkloadValidate, + reasonIfUnsupported, + input0, + input1, + output); } bool IsNormalizationSupportedNeon(const TensorInfo& input, @@ -301,11 +290,7 @@ bool IsNormalizationSupportedNeon(const TensorInfo& input, const NormalizationDescriptor& descriptor, std::string* reasonIfUnsupported) { - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input.GetDataType(), - &IsNeonNormalizationDescParamsSupported, - &FalseFuncU8, - descriptor); + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonNormalizationWorkloadValidate, reasonIfUnsupported, input, output, descriptor); } bool IsOutputSupportedNeon(const TensorInfo& output, @@ -341,14 +326,11 @@ bool IsResizeBilinearSupportedNeon(const TensorInfo& input, } bool IsSoftmaxSupportedNeon(const TensorInfo& input, + const TensorInfo& output, const SoftmaxDescriptor& descriptor, std::string* reasonIfUnsupported) { - ignore_unused(descriptor); - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<>, - &TrueFunc<>); + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonSoftmaxWorkloadValidate, reasonIfUnsupported, input, output, descriptor); } bool IsSplitterSupportedNeon(const TensorInfo& input, @@ -385,10 +367,72 @@ bool IsFloorSupportedNeon(const TensorInfo& input, std::string* reasonIfUnsupported) { ignore_unused(output); - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<>, - &FalseFuncU8<>); + return IsNeonBackendSupported(reasonIfUnsupported) && + IsSupportedForDataTypeGeneric(reasonIfUnsupported, + input.GetDataType(), + &FalseFuncF16<>, + &TrueFunc<>, + &FalseFuncU8<>); +} + +bool IsLstmSupportedNeon(const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor& descriptor, + const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported) +{ + ignore_unused(input); + ignore_unused(outputStateIn); + ignore_unused(cellStateIn); + ignore_unused(scratchBuffer); + ignore_unused(outputStateOut); + ignore_unused(cellStateOut); + ignore_unused(output); + ignore_unused(descriptor); + ignore_unused(inputToForgetWeights); + ignore_unused(inputToCellWeights); + ignore_unused(inputToOutputWeights); + ignore_unused(recurrentToForgetWeights); + ignore_unused(recurrentToCellWeights); + ignore_unused(recurrentToOutputWeights); + ignore_unused(forgetGateBias); + ignore_unused(cellBias); + ignore_unused(outputGateBias); + ignore_unused(inputToInputWeights); + ignore_unused(recurrentToInputWeights); + ignore_unused(cellToInputWeights); + ignore_unused(inputGateBias); + ignore_unused(projectionWeights); + ignore_unused(projectionBias); + ignore_unused(cellToForgetWeights); + ignore_unused(cellToOutputWeights); + return false; +} + +bool IsConvertFp16ToFp32SupportedNeon(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported) +{ + ignore_unused(input); + ignore_unused(output); + return true; +} + +bool IsConvertFp32ToFp16SupportedNeon(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported) +{ + ignore_unused(input); + ignore_unused(output); + return true; } } diff --git a/src/armnn/backends/NeonLayerSupport.hpp b/src/armnn/backends/NeonLayerSupport.hpp index ce2ecec459..6f9fe9c20e 100644 --- a/src/armnn/backends/NeonLayerSupport.hpp +++ b/src/armnn/backends/NeonLayerSupport.hpp @@ -11,14 +11,13 @@ namespace armnn { -bool IsNeonActivationUint8Supported(std::string* reasonIfUnsupported, const ActivationDescriptor& parameters); - bool IsNeonDirectConvolutionPreferred(const TensorInfo& weightInfo, const Convolution2dDescriptor& desc); bool IsNeonNormalizationDescParamsSupported(std::string* reasonIfUnsupported, const NormalizationDescriptor& parameters); bool IsActivationSupportedNeon(const TensorInfo& input, + const TensorInfo& output, const ActivationDescriptor& descriptor, std::string* reasonIfUnsupported); @@ -32,6 +31,11 @@ bool IsAdditionSupportedNeon(const TensorInfo& input0, std::string* reasonIfUnsupported); bool IsBatchNormalizationSupportedNeon(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, const BatchNormalizationDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -45,12 +49,18 @@ bool IsConvolution2dSupportedNeon(const TensorInfo& input, const TensorInfo& biases, std::string* reasonIfUnsupported = nullptr); + bool IsDepthwiseConvolutionSupportedNeon(const TensorInfo& input, + const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, + const TensorInfo& biases, std::string* reasonIfUnsupported = nullptr); bool IsFullyConnectedSupportedNeon(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, const FullyConnectedDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -58,6 +68,7 @@ bool IsInputSupportedNeon(const TensorInfo& input, std::string* reasonIfUnsupported = nullptr); bool IsL2NormalizationSupportedNeon(const TensorInfo& input, + const TensorInfo& output, std::string* reasonIfUnsupported = nullptr); bool IsMergerSupportedNeon(const std::vector inputs, @@ -66,6 +77,7 @@ bool IsMergerSupportedNeon(const std::vector inputs, bool IsMultiplicationSupportedNeon(const TensorInfo& input0, const TensorInfo& input1, + const TensorInfo& output, std::string* reasonIfUnsupported = nullptr); bool IsNormalizationSupportedNeon(const TensorInfo& input, @@ -90,6 +102,7 @@ bool IsResizeBilinearSupportedNeon(const TensorInfo& input, std::string* reasonIfUnsupported = nullptr); bool IsSoftmaxSupportedNeon(const TensorInfo& input, + const TensorInfo& output, const SoftmaxDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -108,4 +121,26 @@ bool IsFloorSupportedNeon(const TensorInfo& input, const TensorInfo& output, std::string* reasonIfUnsupported = nullptr); +bool IsLstmSupportedNeon(const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor& descriptor, + const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported = nullptr); + +bool IsConvertFp16ToFp32SupportedNeon(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported = nullptr); + +bool IsConvertFp32ToFp16SupportedNeon(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported = nullptr); + } diff --git a/src/armnn/backends/NeonTensorHandle.hpp b/src/armnn/backends/NeonTensorHandle.hpp index 684a5e1bfc..3818d2c9b2 100644 --- a/src/armnn/backends/NeonTensorHandle.hpp +++ b/src/armnn/backends/NeonTensorHandle.hpp @@ -7,11 +7,14 @@ #include "OutputHandler.hpp" #include "ArmComputeTensorUtils.hpp" +#include +#include #include #include #include #include +#include namespace armnn { @@ -22,6 +25,7 @@ public: virtual arm_compute::ITensor& GetTensor() = 0; virtual arm_compute::ITensor const& GetTensor() const = 0; virtual arm_compute::DataType GetDataType() const = 0; + virtual void SetMemoryGroup(const std::shared_ptr& memoryGroup) = 0; }; class NeonTensorHandle : public INeonTensorHandle @@ -34,47 +38,100 @@ public: arm_compute::ITensor& GetTensor() override { return m_Tensor; } arm_compute::ITensor const& GetTensor() const override { return m_Tensor; } + virtual void Allocate() override { armnn::armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_Tensor); }; + virtual void Manage() override + { + BOOST_ASSERT(m_MemoryGroup != nullptr); + m_MemoryGroup->manage(&m_Tensor); + } + virtual ITensorHandle::Type GetType() const override { return ITensorHandle::Neon; } + virtual ITensorHandle* GetParent() const override { return nullptr; } + virtual arm_compute::DataType GetDataType() const override { return m_Tensor.info()->data_type(); } + virtual void SetMemoryGroup(const std::shared_ptr& memoryGroup) override + { + m_MemoryGroup = boost::polymorphic_pointer_downcast(memoryGroup); + } + + virtual const void* Map(bool /* blocking = true */) const override + { + return static_cast(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes()); + } + virtual void Unmap() const override {} + + + TensorShape GetStrides() const override + { + return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes()); + } + + TensorShape GetShape() const override + { + return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape()); + } + private: arm_compute::Tensor m_Tensor; + std::shared_ptr m_MemoryGroup; }; class NeonSubTensorHandle : public INeonTensorHandle { public: - NeonSubTensorHandle(arm_compute::ITensor& parent, - const arm_compute::TensorShape& shape, - const arm_compute::Coordinates& coords) - : m_Tensor(&parent, shape, coords) + NeonSubTensorHandle(INeonTensorHandle* parent, + const arm_compute::TensorShape& shape, + const arm_compute::Coordinates& coords) + : m_Tensor(&parent->GetTensor(), shape, coords) { + parentHandle = parent; } arm_compute::ITensor& GetTensor() override { return m_Tensor; } arm_compute::ITensor const& GetTensor() const override { return m_Tensor; } - virtual void Allocate() override - { - }; + + virtual void Allocate() override {} + virtual void Manage() override {} virtual ITensorHandle::Type GetType() const override { return ITensorHandle::Neon; } + virtual ITensorHandle* GetParent() const override { return parentHandle; } + virtual arm_compute::DataType GetDataType() const override { return m_Tensor.info()->data_type(); } + virtual void SetMemoryGroup(const std::shared_ptr&) override {} + + virtual const void* Map(bool /* blocking = true */) const override + { + return static_cast(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes()); + } + virtual void Unmap() const override {} + + TensorShape GetStrides() const override + { + return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes()); + } + + TensorShape GetShape() const override + { + return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape()); + } private: - arm_compute::SubTensor m_Tensor; + arm_compute::SubTensor m_Tensor; + ITensorHandle* parentHandle = nullptr; }; } diff --git a/src/armnn/backends/NeonWorkloadFactory.cpp b/src/armnn/backends/NeonWorkloadFactory.cpp index a17988de5a..6ea72f77cc 100644 --- a/src/armnn/backends/NeonWorkloadFactory.cpp +++ b/src/armnn/backends/NeonWorkloadFactory.cpp @@ -9,10 +9,13 @@ #ifdef ARMCOMPUTENEON_ENABLED #include "arm_compute/runtime/Allocator.h" + #include "MemCopyWorkload.hpp" #include "NeonTensorHandle.hpp" #include "NeonWorkloadUtils.hpp" #include "NeonWorkloads.hpp" + +#include "memory/IPoolManager.hpp" #endif #include "MakeWorkloadHelper.hpp" @@ -22,7 +25,8 @@ namespace armnn { -bool NeonWorkloadFactory::IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported) +bool NeonWorkloadFactory::IsLayerSupported(const Layer& layer, boost::optional dataType, + std::string& outReasonIfUnsupported) { return IWorkloadFactory::IsLayerSupported(Compute::CpuAcc, layer, dataType, outReasonIfUnsupported); } @@ -30,7 +34,7 @@ bool NeonWorkloadFactory::IsLayerSupported(const Layer& layer, DataType dataType #ifdef ARMCOMPUTENEON_ENABLED NeonWorkloadFactory::NeonWorkloadFactory() -: m_MemoryManager(std::make_unique()) + : m_MemoryManager(std::make_unique(), BaseMemoryManager::MemoryAffinity::Offset) { } @@ -46,30 +50,33 @@ std::unique_ptr NeonWorkloadFactory::CreateSubTensorHandle(ITenso coords.set_num_dimensions(subTensorShape.GetNumDimensions()); for (unsigned int i = 0; i < subTensorShape.GetNumDimensions(); i++) { - // arm compute indexes tensor coords in reverse order + // Arm compute indexes tensor coords in reverse order. unsigned int revertedIndex = subTensorShape.GetNumDimensions() - i - 1; coords.set(i, boost::numeric_cast(subTensorOrigin[revertedIndex])); } - return std::make_unique(boost::polymorphic_downcast(&parent)->GetTensor(), - shape, coords); + return std::make_unique( + boost::polymorphic_downcast(&parent), shape, coords); } std::unique_ptr NeonWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo) const { - return std::make_unique(tensorInfo); + auto tensorHandle = std::make_unique(tensorInfo); + tensorHandle->SetMemoryGroup(m_MemoryManager.GetInterLayerMemoryGroup()); + + return tensorHandle; } std::unique_ptr NeonWorkloadFactory::CreateInput(const InputQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload(descriptor, info); + return MakeWorkload(descriptor, info); } std::unique_ptr NeonWorkloadFactory::CreateOutput(const OutputQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload(descriptor, info); + return MakeWorkload(descriptor, info); } std::unique_ptr NeonWorkloadFactory::CreateActivation(const ActivationQueueDescriptor& descriptor, @@ -82,7 +89,7 @@ std::unique_ptr NeonWorkloadFactory::CreateSoftmax(const SoftmaxQueue const WorkloadInfo& info) const { return MakeWorkload(descriptor, info, - m_MemoryManager.Get()); + m_MemoryManager.GetIntraLayerManager()); } std::unique_ptr NeonWorkloadFactory::CreateSplitter(const SplitterQueueDescriptor& descriptor, @@ -100,13 +107,14 @@ std::unique_ptr NeonWorkloadFactory::CreateMerger(const Merger std::unique_ptr NeonWorkloadFactory::CreateFullyConnected( const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload(descriptor, info, m_MemoryManager.Get()); + return MakeWorkload(descriptor, info, + m_MemoryManager.GetIntraLayerManager()); } std::unique_ptr NeonWorkloadFactory::CreatePermute(const PermuteQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload(descriptor, info); + return MakeWorkload(descriptor, info); } std::unique_ptr NeonWorkloadFactory::CreatePooling2d(const Pooling2dQueueDescriptor& descriptor, @@ -119,7 +127,7 @@ std::unique_ptr NeonWorkloadFactory::CreateConvolution2d( const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) const { return MakeWorkload(descriptor, info, - m_MemoryManager.Get()); + m_MemoryManager.GetIntraLayerManager()); } std::unique_ptr NeonWorkloadFactory::CreateDepthwiseConvolution2d( @@ -132,7 +140,8 @@ std::unique_ptr NeonWorkloadFactory::CreateDepthwiseConvolution2d( std::unique_ptr NeonWorkloadFactory::CreateNormalization( const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload(descriptor, info, m_MemoryManager.Get()); + return MakeWorkload(descriptor, info, + m_MemoryManager.GetIntraLayerManager()); } std::unique_ptr NeonWorkloadFactory::CreateAddition(const AdditionQueueDescriptor& descriptor, @@ -161,21 +170,7 @@ std::unique_ptr NeonWorkloadFactory::CreateMemCopy(const MemCo throw InvalidArgumentException("NeonWorkloadFactory: Invalid null input for MemCopy workload"); } - // Create a workload that will copy tensor data from the inputs, which can have a number of different formats, - // to Neon tensors. - switch (descriptor.m_Inputs[0]->GetType()) - { - case ITensorHandle::Cpu: - return MakeWorkload(descriptor, info); -#if ARMCOMPUTECL_ENABLED - case ITensorHandle::CL: - { - return MakeWorkload(descriptor, info); - } -#endif - default: - throw InvalidArgumentException("NeonWorkloadFactory: Destination type not supported for MemCopy Workload."); - } + return MakeWorkload(descriptor, info); } std::unique_ptr NeonWorkloadFactory::CreateResizeBilinear( @@ -195,7 +190,8 @@ std::unique_ptr NeonWorkloadFactory::CreateFakeQuantization( std::unique_ptr NeonWorkloadFactory::CreateL2Normalization(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload(descriptor, info, m_MemoryManager.Get()); + return MakeWorkload(descriptor, info, + m_MemoryManager.GetIntraLayerManager()); } std::unique_ptr NeonWorkloadFactory::CreateConstant(const ConstantQueueDescriptor& descriptor, @@ -216,11 +212,41 @@ std::unique_ptr NeonWorkloadFactory::CreateFloor(const FloorQueueDesc return MakeWorkload(descriptor, info); } +std::unique_ptr NeonWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return MakeWorkload(descriptor, info); +} + +std::unique_ptr NeonWorkloadFactory::CreateConvertFp16ToFp32( + const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return std::make_unique(descriptor, info); +} + +std::unique_ptr NeonWorkloadFactory::CreateConvertFp32ToFp16( + const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return std::make_unique(descriptor, info); +} + void NeonWorkloadFactory::Finalize() { m_MemoryManager.Finalize(); } +void NeonWorkloadFactory::Release() +{ + m_MemoryManager.Release(); +} + +void NeonWorkloadFactory::Acquire() +{ + m_MemoryManager.Acquire(); +} + #else // Compiled without ArmCompute libs NeonWorkloadFactory::NeonWorkloadFactory() @@ -371,9 +397,35 @@ std::unique_ptr NeonWorkloadFactory::CreateFloor(const FloorQueueDesc return nullptr; } +std::unique_ptr NeonWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return nullptr; +} + +std::unique_ptr NeonWorkloadFactory::CreateConvertFp16ToFp32( + const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return nullptr; +} + +std::unique_ptr NeonWorkloadFactory::CreateConvertFp32ToFp16( + const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return nullptr; +} + void NeonWorkloadFactory::Finalize() {} +void NeonWorkloadFactory::Release() +{} + +void NeonWorkloadFactory::Acquire() +{} + #endif } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloadFactory.hpp b/src/armnn/backends/NeonWorkloadFactory.hpp index 66a69f3baf..83e1f5e75f 100644 --- a/src/armnn/backends/NeonWorkloadFactory.hpp +++ b/src/armnn/backends/NeonWorkloadFactory.hpp @@ -4,15 +4,17 @@ // #pragma once -#include "AclBaseMemoryManager.hpp" #include "OutputHandler.hpp" +#include "memory/BaseMemoryManager.hpp" + #include +#include namespace armnn { -// Neon workload factory +// Neon workload factory. class NeonWorkloadFactory : public IWorkloadFactory { public: @@ -20,7 +22,8 @@ public: virtual Compute GetCompute() const override { return Compute::CpuAcc; } - static bool IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported); + static bool IsLayerSupported(const Layer& layer, boost::optional dataType, + std::string& outReasonIfUnsupported); virtual bool SupportsSubTensors() const override { return true; } @@ -96,11 +99,25 @@ public: virtual std::unique_ptr CreateFloor(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info) const override; - void Finalize() override; + virtual std::unique_ptr CreateLstm(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) const override; -private: + virtual std::unique_ptr CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) const override; + + virtual std::unique_ptr CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) const override; + + virtual void Finalize() override; - mutable AclBaseMemoryManager m_MemoryManager; + virtual void Release() override; + + virtual void Acquire() override; + +private: +#ifdef ARMCOMPUTENEON_ENABLED + mutable NeonMemoryManager m_MemoryManager; +#endif }; } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloadUtils.cpp b/src/armnn/backends/NeonWorkloadUtils.cpp index e807d23d6c..07e5d510eb 100644 --- a/src/armnn/backends/NeonWorkloadUtils.cpp +++ b/src/armnn/backends/NeonWorkloadUtils.cpp @@ -20,13 +20,14 @@ #include "NeonLayerSupport.hpp" #include "../../../include/armnn/Types.hpp" +#include "Half.hpp" using namespace armnn::armcomputetensorutils; namespace armnn { -// Allocate a tensor and copy the contents in data to the tensor contents +// Allocates a tensor and copy the contents in data to the tensor contents. template void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const T* data) { @@ -34,8 +35,26 @@ void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const T* data) CopyArmComputeITensorData(data, tensor); } +template void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const Half* data); template void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const float* data); template void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const uint8_t* data); template void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const int32_t* data); +void InitializeArmComputeTensorDataForFloatTypes(arm_compute::Tensor& tensor, + const ConstCpuTensorHandle* handle) +{ + BOOST_ASSERT(handle); + switch(handle->GetTensorInfo().GetDataType()) + { + case DataType::Float16: + InitialiseArmComputeTensorData(tensor, handle->GetConstTensor()); + break; + case DataType::Float32: + InitialiseArmComputeTensorData(tensor, handle->GetConstTensor()); + break; + default: + BOOST_ASSERT_MSG(false, "Unexpected floating point type."); + } +}; + } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloadUtils.hpp b/src/armnn/backends/NeonWorkloadUtils.hpp index ec7688237a..8169f8636a 100644 --- a/src/armnn/backends/NeonWorkloadUtils.hpp +++ b/src/armnn/backends/NeonWorkloadUtils.hpp @@ -7,6 +7,7 @@ #include "Workload.hpp" #include "backends/NeonTensorHandle.hpp" +#include "NeonTimer.hpp" #include "arm_compute/core/Types.h" #include "arm_compute/core/Helpers.h" @@ -22,4 +23,12 @@ class Layer; template void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const T* data); +void InitializeArmComputeTensorDataForFloatTypes(arm_compute::Tensor& tensor, const ConstCpuTensorHandle* handle); } //namespace armnn + + +#define ARMNN_SCOPED_PROFILING_EVENT_NEON(name) \ + ARMNN_SCOPED_PROFILING_EVENT_WITH_INSTRUMENTS(armnn::Compute::CpuAcc, \ + name, \ + armnn::WallClockTimer(), \ + armnn::NeonTimer()) diff --git a/src/armnn/backends/NeonWorkloads.hpp b/src/armnn/backends/NeonWorkloads.hpp index 83a3e9fd9b..9619b4e5c9 100644 --- a/src/armnn/backends/NeonWorkloads.hpp +++ b/src/armnn/backends/NeonWorkloads.hpp @@ -13,6 +13,8 @@ #include "backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp" #include "backends/NeonWorkloads/NeonConstantFloat32Workload.hpp" #include "backends/NeonWorkloads/NeonConstantUint8Workload.hpp" +#include "backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.hpp" +#include "backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.hpp" #include "backends/NeonWorkloads/NeonConvolution2dBaseWorkload.hpp" #include "backends/NeonWorkloads/NeonConvolution2dFloat32Workload.hpp" #include "backends/NeonWorkloads/NeonConvolution2dUint8Workload.hpp" @@ -21,6 +23,7 @@ #include "backends/NeonWorkloads/NeonFloorFloat32Workload.hpp" #include "backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp" #include "backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp" +#include "backends/NeonWorkloads/NeonLstmFloat32Workload.hpp" #include "backends/NeonWorkloads/NeonMergerFloat32Workload.hpp" #include "backends/NeonWorkloads/NeonMergerUint8Workload.hpp" #include "backends/NeonWorkloads/NeonMultiplicationFloat32Workload.hpp" diff --git a/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.cpp index 39e55d5761..711bfceeaf 100644 --- a/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.cpp @@ -9,9 +9,32 @@ namespace armnn { + +arm_compute::Status NeonActivationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const ActivationDescriptor& descriptor) +{ + const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + const arm_compute::ActivationLayerInfo activationLayerInfo = + ConvertActivationDescriptorToAclActivationLayerInfo(descriptor); + + if (input.GetDataType() == DataType::QuantisedAsymm8 && + activationLayerInfo.activation() == arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + return arm_compute::Status{arm_compute::ErrorCode::RUNTIME_ERROR, + "Neon: Logistic Activations unsupported with QAsymm8 data type."}; + } + + return arm_compute::NEActivationLayer::validate(&aclInput, + &aclOutput, + activationLayerInfo); +} + NeonActivationFloat32Workload::NeonActivationFloat32Workload(const ActivationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) { m_Data.ValidateInputsOutputs("NeonActivationFloat32Workload", 1, 1); @@ -26,7 +49,7 @@ NeonActivationFloat32Workload::NeonActivationFloat32Workload(const ActivationQue void NeonActivationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonActivationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonActivationFloat32Workload_Execute"); m_ActivationLayer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.hpp index 6fa83ea2f6..0d26b3b39f 100644 --- a/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.hpp @@ -9,7 +9,12 @@ namespace armnn { -class NeonActivationFloat32Workload : public Float32Workload + +arm_compute::Status NeonActivationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const ActivationDescriptor& descriptor); + +class NeonActivationFloat32Workload : public FloatWorkload { public: NeonActivationFloat32Workload(const ActivationQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/NeonWorkloads/NeonActivationUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonActivationUint8Workload.cpp index 27c37e9425..f2e42338b2 100644 --- a/src/armnn/backends/NeonWorkloads/NeonActivationUint8Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonActivationUint8Workload.cpp @@ -13,15 +13,8 @@ NeonActivationUint8Workload::NeonActivationUint8Workload(const ActivationQueueDe const WorkloadInfo& info) : Uint8Workload(descriptor, info) { - - std::string reasonIfUnsupported; - if (!IsNeonActivationUint8Supported(&reasonIfUnsupported, m_Data.m_Parameters)) - { - throw InvalidArgumentException(reasonIfUnsupported); - } - - // Only BoundedReLu is supported (see IsNeonActivationUint8Supported) - arm_compute::ActivationLayerInfo layerInfo(arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + auto activation = ConvertActivationFunctionToAclActivationFunction(m_Data.m_Parameters.m_Function); + arm_compute::ActivationLayerInfo layerInfo(activation, m_Data.m_Parameters.m_A, m_Data.m_Parameters.m_B); @@ -35,7 +28,7 @@ NeonActivationUint8Workload::NeonActivationUint8Workload(const ActivationQueueDe void NeonActivationUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonActivationUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonActivationUint8Workload_Execute"); m_ActivationLayer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.cpp index d1fb64093d..f26e42aff9 100644 --- a/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.cpp @@ -4,14 +4,30 @@ // #include "NeonAdditionFloat32Workload.hpp" +#include "backends/ArmComputeTensorUtils.hpp" #include "backends/CpuTensorHandle.hpp" namespace armnn { +arm_compute::Status NeonAdditionWorkloadValidate(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output) +{ + const arm_compute::TensorInfo aclInput0 = armcomputetensorutils::BuildArmComputeTensorInfo(input0); + const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input1); + const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + return arm_compute::NEArithmeticAddition::validate(&aclInput0, + &aclInput1, + &aclOutput, + arm_compute::ConvertPolicy::SATURATE); +} + + NeonAdditionFloat32Workload::NeonAdditionFloat32Workload(const AdditionQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) { m_Data.ValidateInputsOutputs("NeonAdditionFloat32Workload", 2, 1); @@ -24,7 +40,7 @@ NeonAdditionFloat32Workload::NeonAdditionFloat32Workload(const AdditionQueueDesc void NeonAdditionFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonAdditionFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonAdditionFloat32Workload_Execute"); m_AddLayer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.hpp index 5b75b502a3..dae66bb69d 100644 --- a/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.hpp @@ -9,7 +9,12 @@ namespace armnn { -class NeonAdditionFloat32Workload : public Float32Workload + +arm_compute::Status NeonAdditionWorkloadValidate(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output); + +class NeonAdditionFloat32Workload : public FloatWorkload { public: NeonAdditionFloat32Workload(const AdditionQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/NeonWorkloads/NeonBaseConstantWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonBaseConstantWorkload.hpp index 247ebfc5dd..e0ad408424 100644 --- a/src/armnn/backends/NeonWorkloads/NeonBaseConstantWorkload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonBaseConstantWorkload.hpp @@ -5,23 +5,27 @@ #pragma once +#include #include #include #include +#include #include +#include #include +#include "Half.hpp" namespace armnn { -// Base class template providing an implementation of the Constant layer common to all data types -template -class NeonBaseConstantWorkload : public TypedWorkload +// Base class template providing an implementation of the Constant layer common to all data types. +template +class NeonBaseConstantWorkload : public TypedWorkload { public: NeonBaseConstantWorkload(const ConstantQueueDescriptor& descriptor, const WorkloadInfo& info) - : TypedWorkload(descriptor, info) + : TypedWorkload(descriptor, info) , m_RanOnce(false) { } @@ -41,15 +45,22 @@ public: BOOST_ASSERT(data.m_LayerOutput != nullptr); arm_compute::ITensor& output = boost::polymorphic_downcast(data.m_Outputs[0])->GetTensor(); + arm_compute::DataType computeDataType = + boost::polymorphic_downcast(data.m_Outputs[0])->GetDataType(); - switch (DataFormat) + switch (computeDataType) { - case DataType::Float32: + case arm_compute::DataType::F16: + { + CopyArmComputeITensorData(data.m_LayerOutput->GetConstTensor(), output); + break; + } + case arm_compute::DataType::F32: { CopyArmComputeITensorData(data.m_LayerOutput->GetConstTensor(), output); break; } - case DataType::QuantisedAsymm8: + case arm_compute::DataType::QASYMM8: { CopyArmComputeITensorData(data.m_LayerOutput->GetConstTensor(), output); break; diff --git a/src/armnn/backends/NeonWorkloads/NeonBaseMergerWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonBaseMergerWorkload.hpp index 24640c7adb..6a87d62320 100644 --- a/src/armnn/backends/NeonWorkloads/NeonBaseMergerWorkload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonBaseMergerWorkload.hpp @@ -5,20 +5,21 @@ #pragma once +#include #include namespace armnn { -// Base class template providing an implementation of the Merger layer common to all data types -template -class NeonBaseMergerWorkload : public TypedWorkload +// Base class template providing an implementation of the Merger layer common to all data types. +template +class NeonBaseMergerWorkload : public TypedWorkload { public: - using TypedWorkload::TypedWorkload; + using TypedWorkload::TypedWorkload; virtual void Execute() const override { - // With subtensors, merger is a no-op + // With subtensors, merger is a no-op. } }; diff --git a/src/armnn/backends/NeonWorkloads/NeonBaseSplitterWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonBaseSplitterWorkload.hpp index 769905b48b..769291c700 100644 --- a/src/armnn/backends/NeonWorkloads/NeonBaseSplitterWorkload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonBaseSplitterWorkload.hpp @@ -6,20 +6,21 @@ #pragma once #include +#include namespace armnn { -// Base class template providing an implementation of the Splitter layer common to all data types -template -class NeonBaseSplitterWorkload : public TypedWorkload +// Base class template providing an implementation of the Splitter layer common to all data types. +template +class NeonBaseSplitterWorkload : public TypedWorkload { public: - using TypedWorkload::TypedWorkload; + using TypedWorkload::TypedWorkload; virtual void Execute() const override { - // With subtensors, splitter is a no-op + // With subtensors, splitter is a no-op. } }; diff --git a/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.cpp index f107c8137f..ca5c8202cd 100644 --- a/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.cpp @@ -6,40 +6,91 @@ #include "NeonBatchNormalizationFloat32Workload.hpp" #include "backends/CpuTensorHandle.hpp" #include "backends/ArmComputeTensorUtils.hpp" +#include "../../../../include/armnn/ArmNN.hpp" namespace armnn { using namespace armcomputetensorutils; + +arm_compute::Status NeonBatchNormalizationValidate(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, + const BatchNormalizationDescriptor& descriptor) +{ + const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); + const arm_compute::TensorInfo aclMeanInfo = BuildArmComputeTensorInfo(mean); + const arm_compute::TensorInfo aclVarInfo = BuildArmComputeTensorInfo(var); + const arm_compute::TensorInfo aclBetaInfo = BuildArmComputeTensorInfo(beta); + const arm_compute::TensorInfo aclGammaInfo = BuildArmComputeTensorInfo(gamma); + + return arm_compute::NEBatchNormalizationLayer::validate(&aclInputInfo, + &aclOutputInfo, + &aclMeanInfo, + &aclVarInfo, + &aclBetaInfo, + &aclGammaInfo, + descriptor.m_Eps); +} + NeonBatchNormalizationFloat32Workload::NeonBatchNormalizationFloat32Workload( const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) { m_Data.ValidateInputsOutputs("NeonBatchNormalizationFloat32Workload", 1, 1); arm_compute::ITensor& input = boost::polymorphic_downcast(m_Data.m_Inputs[0])->GetTensor(); arm_compute::ITensor& output = boost::polymorphic_downcast(m_Data.m_Outputs[0])->GetTensor(); - BuildArmComputeTensor(m_Mean, m_Data.m_Mean->GetTensorInfo()); - BuildArmComputeTensor(m_Variance, m_Data.m_Variance->GetTensorInfo()); - BuildArmComputeTensor(m_Gamma, m_Data.m_Gamma->GetTensorInfo()); - BuildArmComputeTensor(m_Beta, m_Data.m_Beta->GetTensorInfo()); + m_Mean = std::make_unique(); + BuildArmComputeTensor(*m_Mean, m_Data.m_Mean->GetTensorInfo()); + + m_Variance = std::make_unique(); + BuildArmComputeTensor(*m_Variance, m_Data.m_Variance->GetTensorInfo()); - m_Layer.configure( - &input, &output, &m_Mean, &m_Variance, &m_Beta, &m_Gamma, m_Data.m_Parameters.m_Eps); + m_Gamma = std::make_unique(); + BuildArmComputeTensor(*m_Gamma, m_Data.m_Gamma->GetTensorInfo()); - InitialiseArmComputeTensorData(m_Mean, m_Data.m_Mean->GetConstTensor()); - InitialiseArmComputeTensorData(m_Variance, m_Data.m_Variance->GetConstTensor()); - InitialiseArmComputeTensorData(m_Gamma, m_Data.m_Gamma->GetConstTensor()); - InitialiseArmComputeTensorData(m_Beta, m_Data.m_Beta->GetConstTensor()); + m_Beta = std::make_unique(); + BuildArmComputeTensor(*m_Beta, m_Data.m_Beta->GetTensorInfo()); + + m_Layer.configure(&input, + &output, + m_Mean.get(), + m_Variance.get(), + m_Beta.get(), + m_Gamma.get(), + m_Data.m_Parameters.m_Eps); + + InitializeArmComputeTensorDataForFloatTypes(*m_Mean, m_Data.m_Mean); + InitializeArmComputeTensorDataForFloatTypes(*m_Variance, m_Data.m_Variance); + InitializeArmComputeTensorDataForFloatTypes(*m_Gamma, m_Data.m_Gamma); + InitializeArmComputeTensorDataForFloatTypes(*m_Beta, m_Data.m_Beta); + + // Force Compute Library to perform the necessary copying and reshaping, after which + // delete all the input tensors that will no longer be needed + m_Layer.prepare(); + FreeUnusedTensors(); } void NeonBatchNormalizationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonBatchNormalizationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonBatchNormalizationFloat32Workload_Execute"); m_Layer.run(); } +void NeonBatchNormalizationFloat32Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_Mean); + FreeTensorIfUnused(m_Variance); + FreeTensorIfUnused(m_Gamma); + FreeTensorIfUnused(m_Beta); +} + } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp index 2050d42859..5eb5601f26 100644 --- a/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp @@ -10,7 +10,15 @@ namespace armnn { -class NeonBatchNormalizationFloat32Workload : public Float32Workload +arm_compute::Status NeonBatchNormalizationValidate(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, + const BatchNormalizationDescriptor& descriptor); + +class NeonBatchNormalizationFloat32Workload : public FloatWorkload { public: NeonBatchNormalizationFloat32Workload(const BatchNormalizationQueueDescriptor& descriptor, @@ -20,10 +28,12 @@ public: private: mutable arm_compute::NEBatchNormalizationLayer m_Layer; - arm_compute::Tensor m_Mean; - arm_compute::Tensor m_Variance; - arm_compute::Tensor m_Gamma; - arm_compute::Tensor m_Beta; + std::unique_ptr m_Mean; + std::unique_ptr m_Variance; + std::unique_ptr m_Gamma; + std::unique_ptr m_Beta; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.cpp index 8b203fbf3a..4e5d570a8e 100644 --- a/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.cpp @@ -10,7 +10,7 @@ namespace armnn void NeonConstantFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonConstantFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConstantFloat32Workload_Execute"); NeonBaseConstantWorkload::Execute(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.hpp index 4ea4dfe127..050954df24 100644 --- a/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.hpp @@ -10,10 +10,10 @@ namespace armnn { -class NeonConstantFloat32Workload : public NeonBaseConstantWorkload +class NeonConstantFloat32Workload : public NeonBaseConstantWorkload { public: - using NeonBaseConstantWorkload::NeonBaseConstantWorkload; + using NeonBaseConstantWorkload::NeonBaseConstantWorkload; virtual void Execute() const override; }; diff --git a/src/armnn/backends/NeonWorkloads/NeonConstantUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonConstantUint8Workload.cpp index f6dfaeb7a7..4061605bc1 100644 --- a/src/armnn/backends/NeonWorkloads/NeonConstantUint8Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonConstantUint8Workload.cpp @@ -10,7 +10,7 @@ namespace armnn void NeonConstantUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonConstantUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConstantUint8Workload_Execute"); NeonBaseConstantWorkload::Execute(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.cpp new file mode 100644 index 0000000000..84fc051f65 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.cpp @@ -0,0 +1,41 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "NeonConvertFp16ToFp32Workload.hpp" +#include "Half.hpp" +#include "FloatingPointConverter.hpp" + +#include "backends/WorkloadUtils.hpp" + +namespace armnn +{ + +NeonConvertFp16ToFp32Workload::NeonConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) + : Float16ToFloat32Workload(descriptor, info) +{ + this->m_Data.ValidateInputsOutputs("NeonConvertFp16ToFp32Workload", 1, 1); + GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); +} + +void NeonConvertFp16ToFp32Workload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvertFp16ToFp32Workload_Execute"); + + auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size) + { + auto input = reinterpret_cast(src); + auto output = reinterpret_cast(dst); + size_t numElements = size/2; // 2 bytes per fp16 + armnnUtils::FloatingPointConverter::ConvertFloat16To32(input, numElements, output); + }; + + for (const auto& pair : m_TensorHandlePairs) + { + CopyTensorContentsGeneric(pair.first, pair.second, convertFunc); + } +} + +} //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.hpp new file mode 100644 index 0000000000..136c0d8a76 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.hpp @@ -0,0 +1,26 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/Workload.hpp" +#include "backends/WorkloadData.hpp" +#include "backends/NeonWorkloadUtils.hpp" + +namespace armnn +{ + +class NeonConvertFp16ToFp32Workload : public Float16ToFloat32Workload +{ +public: + NeonConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info); + virtual void Execute() const override; + +private: + using TensorHandlePair = std::pair; + std::vector m_TensorHandlePairs; +}; + +} //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.cpp new file mode 100644 index 0000000000..61f30522a8 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.cpp @@ -0,0 +1,43 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "NeonConvertFp32ToFp16Workload.hpp" + +#include "Half.hpp" +#include "FloatingPointConverter.hpp" + +#include "Profiling.hpp" +#include "backends/WorkloadUtils.hpp" + +namespace armnn +{ + +NeonConvertFp32ToFp16Workload::NeonConvertFp32ToFp16Workload(const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) + : Float32ToFloat16Workload(descriptor, info) +{ + this->m_Data.ValidateInputsOutputs("NeonConvertFp32ToFp16Workload", 1, 1); + GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); +} + +void NeonConvertFp32ToFp16Workload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvertFp32ToFp16Workload_Execute"); + + auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size) + { + auto input = reinterpret_cast(src); + auto output = reinterpret_cast(dst); + size_t numElements = size/2; // 2 bytes per fp16 + armnnUtils::FloatingPointConverter::ConvertFloat32To16(input, numElements, output); + }; + + for (const auto& pair : m_TensorHandlePairs) + { + CopyTensorContentsGeneric(pair.first, pair.second, convertFunc); + } +} + +} //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.hpp new file mode 100644 index 0000000000..f48c365c48 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.hpp @@ -0,0 +1,26 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/Workload.hpp" +#include "backends/WorkloadData.hpp" +#include "backends/NeonWorkloadUtils.hpp" + +namespace armnn +{ + +class NeonConvertFp32ToFp16Workload : public Float32ToFloat16Workload +{ +public: + NeonConvertFp32ToFp16Workload(const ConvertFp32ToFp16QueueDescriptor& descriptor, const WorkloadInfo& info); + virtual void Execute() const override; + +private: + using TensorHandlePair = std::pair; + std::vector m_TensorHandlePairs; +}; + +} //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.cpp b/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.cpp index 423f02bcb0..e76afb6cf7 100644 --- a/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.cpp @@ -9,6 +9,9 @@ #include "NeonConvolution2dBaseWorkload.hpp" +#include "armnn/Types.hpp" +#include "Half.hpp" + namespace armnn { @@ -41,28 +44,28 @@ arm_compute::Status NeonConvolution2dWorkloadValidate(const TensorInfo& input, layerInfo); } -template -NeonConvolution2dBaseWorkload::NeonConvolution2dBaseWorkload(const Convolution2dQueueDescriptor& descriptor, - const WorkloadInfo& info, std::shared_ptr& memoryManager) - : TypedWorkload(descriptor, info) +template +NeonConvolution2dBaseWorkload::NeonConvolution2dBaseWorkload( + const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info, + std::shared_ptr& memoryManager) + : TypedWorkload(descriptor, info) { using arm_compute::NEDirectConvolutionLayer; - using namespace armcomputetensorutils; ValidateData(); - // todo: check tensor shapes match + // todo: check tensor shapes match. arm_compute::ITensor& input = boost::polymorphic_downcast(m_Data.m_Inputs[0])->GetTensor(); arm_compute::ITensor& output = boost::polymorphic_downcast(m_Data.m_Outputs[0])->GetTensor(); - BuildArmComputeTensor(m_KernelTensor, m_Data.m_Weight->GetTensorInfo()); + m_KernelTensor = std::make_unique(); + BuildArmComputeTensor(*m_KernelTensor, m_Data.m_Weight->GetTensorInfo()); - arm_compute::Tensor* optionalBiasTensor = nullptr; if (m_Data.m_Parameters.m_BiasEnabled) { - BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); - optionalBiasTensor = &m_BiasTensor; + m_BiasTensor = std::make_unique(); + BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); } arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX, @@ -81,8 +84,8 @@ NeonConvolution2dBaseWorkload::NeonConvolution2dBaseWorkload(const Con { auto directConvolutionLayer = std::make_unique(memoryManager); directConvolutionLayer->configure(&input, - &m_KernelTensor, - optionalBiasTensor, + m_KernelTensor.get(), + m_BiasTensor.get(), &output, padStrideInfo); m_ConvolutionLayer.reset(directConvolutionLayer.release()); @@ -91,22 +94,50 @@ NeonConvolution2dBaseWorkload::NeonConvolution2dBaseWorkload(const Con { auto convolutionLayer = std::make_unique(memoryManager); convolutionLayer->configure(&input, - &m_KernelTensor, - optionalBiasTensor, + m_KernelTensor.get(), + m_BiasTensor.get(), &output, padStrideInfo); m_ConvolutionLayer.reset(convolutionLayer.release()); } BOOST_ASSERT(m_ConvolutionLayer); - using Type = ResolveType; + armnn::DataType dataType = m_Data.m_Weight->GetTensorInfo().GetDataType(); + + switch (dataType) + { + case DataType::Float16: + { + InitialiseArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight->template GetConstTensor()); + break; + } + case DataType::Float32: + { + InitialiseArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight->template GetConstTensor()); + break; + } + case DataType::QuantisedAsymm8: + { + InitialiseArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight->template GetConstTensor()); + break; + } + default: + { + BOOST_ASSERT_MSG(false, "Unknown DataType."); + } + } +} - InitialiseArmComputeTensorData(m_KernelTensor, m_Data.m_Weight->template GetConstTensor()); +template +void NeonConvolution2dBaseWorkload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_KernelTensor); + FreeTensorIfUnused(m_BiasTensor); } -// Generate known implementations for linker -template class NeonConvolution2dBaseWorkload; -template class NeonConvolution2dBaseWorkload; +// Generates known implementations for linker. +template class NeonConvolution2dBaseWorkload; +template class NeonConvolution2dBaseWorkload; } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.hpp index d28d50d819..524d2c90b6 100644 --- a/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.hpp @@ -25,11 +25,11 @@ arm_compute::Status NeonConvolution2dWorkloadValidate(const TensorInfo& input, const TensorInfo& weights, const TensorInfo& biases); -template -class NeonConvolution2dBaseWorkload : public TypedWorkload +template +class NeonConvolution2dBaseWorkload : public TypedWorkload { public: - using TypedWorkload::m_Data; + using TypedWorkload::m_Data; NeonConvolution2dBaseWorkload(const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info, std::shared_ptr& memoryManager); @@ -38,8 +38,11 @@ public: protected: std::unique_ptr m_ConvolutionLayer; - arm_compute::Tensor m_KernelTensor; - arm_compute::Tensor m_BiasTensor; + + std::unique_ptr m_KernelTensor; + std::unique_ptr m_BiasTensor; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.cpp index f20f2a4ac5..18ec6ca2e7 100644 --- a/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.cpp @@ -18,13 +18,16 @@ NeonConvolution2dFloat32Workload::NeonConvolution2dFloat32Workload(const Convolu { if (m_Data.m_Parameters.m_BiasEnabled) { - InitialiseArmComputeTensorData(m_BiasTensor, m_Data.m_Bias->template GetConstTensor()); + InitializeArmComputeTensorDataForFloatTypes(*m_BiasTensor, m_Data.m_Bias); } + + m_ConvolutionLayer->prepare(); + FreeUnusedTensors(); } void NeonConvolution2dFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonConvolution2dFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvolution2dFloat32Workload_Execute"); m_ConvolutionLayer->run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.hpp index 56b0848efa..0bb8d69d94 100644 --- a/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.hpp @@ -15,7 +15,7 @@ namespace armnn { -class NeonConvolution2dFloat32Workload : public NeonConvolution2dBaseWorkload +class NeonConvolution2dFloat32Workload : public NeonConvolution2dBaseWorkload { public: NeonConvolution2dFloat32Workload(const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info, diff --git a/src/armnn/backends/NeonWorkloads/NeonConvolution2dUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonConvolution2dUint8Workload.cpp index fb91f7b7b2..bb33e939ea 100644 --- a/src/armnn/backends/NeonWorkloads/NeonConvolution2dUint8Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonConvolution2dUint8Workload.cpp @@ -14,14 +14,16 @@ NeonConvolution2dUint8Workload::NeonConvolution2dUint8Workload(const Convolution { if (m_Data.m_Parameters.m_BiasEnabled) { - InitialiseArmComputeTensorData(m_BiasTensor, m_Data.m_Bias->template GetConstTensor()); + InitialiseArmComputeTensorData(*m_BiasTensor, m_Data.m_Bias->template GetConstTensor()); } -} + m_ConvolutionLayer->prepare(); + FreeUnusedTensors(); +} void NeonConvolution2dUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonConvolution2dUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvolution2dUint8Workload_Execute"); m_ConvolutionLayer->run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.cpp b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.cpp new file mode 100644 index 0000000000..58d6061537 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.cpp @@ -0,0 +1,46 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "NeonDepthwiseConvolutionBaseWorkload.hpp" + +#include "backends/ArmComputeTensorUtils.hpp" + +namespace armnn +{ + +arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const DepthwiseConvolution2dDescriptor& descriptor, + const TensorInfo& weights, + const TensorInfo& biases) +{ + const arm_compute::TensorInfo aclInputInfo = + armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = + armcomputetensorutils::BuildArmComputeTensorInfo(output); + const arm_compute::TensorInfo aclWeightsInfo = + armcomputetensorutils::BuildArmComputeTensorInfo(weights); + + arm_compute::TensorInfo aclBiasesInfo; + arm_compute::TensorInfo *optionalAclBiasesInfo = nullptr; + if (descriptor.m_BiasEnabled) + { + aclBiasesInfo = armcomputetensorutils::BuildArmComputeTensorInfo(biases); + optionalAclBiasesInfo = &aclBiasesInfo; + } + + const arm_compute::PadStrideInfo aclPadStrideInfo = + armcomputetensorutils::BuildArmComputePadStrideInfo(descriptor); + const unsigned int aclDepthMultiplier = weights.GetShape()[0]; + + return arm_compute::NEDepthwiseConvolutionLayer::validate(&aclInputInfo, + &aclWeightsInfo, + optionalAclBiasesInfo, + &aclOutputInfo, + aclPadStrideInfo, + aclDepthMultiplier); +} + +} diff --git a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.hpp new file mode 100644 index 0000000000..0cead354f8 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.hpp @@ -0,0 +1,19 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/NeonWorkloadUtils.hpp" + +namespace armnn +{ + +arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const DepthwiseConvolution2dDescriptor& descriptor, + const TensorInfo& weights, + const TensorInfo& biases); + +} // namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.cpp index 11e31c727a..f94cd903b6 100644 --- a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.cpp @@ -16,23 +16,17 @@ using namespace armcomputetensorutils; NeonDepthwiseConvolutionFloat32Workload::NeonDepthwiseConvolutionFloat32Workload( const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) { const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo(); - std::string reasonIfUnsupported; - if (!IsNeonDepthwiseConvolution2dDescParamsSupported(&reasonIfUnsupported, m_Data.m_Parameters, weightInfo)) - { - throw UnimplementedException(reasonIfUnsupported); - } + m_KernelTensor = std::make_unique(); + BuildArmComputeTensor(*m_KernelTensor, weightInfo); - BuildArmComputeTensor(m_KernelTensor, weightInfo); - - arm_compute::Tensor* optionalBias = nullptr; if (m_Data.m_Parameters.m_BiasEnabled) { - BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); - optionalBias = &m_BiasTensor; + m_BiasTensor = std::make_unique(); + BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); } arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX, @@ -54,8 +48,8 @@ NeonDepthwiseConvolutionFloat32Workload::NeonDepthwiseConvolutionFloat32Workload m_pDepthwiseConvolutionLayer = std::make_unique(); static_cast( m_pDepthwiseConvolutionLayer.get())->configure(&input, - &m_KernelTensor, - optionalBias, + m_KernelTensor.get(), + m_BiasTensor.get(), &output, padStrideInfo); } @@ -64,28 +58,37 @@ NeonDepthwiseConvolutionFloat32Workload::NeonDepthwiseConvolutionFloat32Workload m_pDepthwiseConvolutionLayer = std::make_unique(); static_cast( m_pDepthwiseConvolutionLayer.get())->configure(&input, - &m_KernelTensor, - optionalBias, + m_KernelTensor.get(), + m_BiasTensor.get(), &output, padStrideInfo); } BOOST_ASSERT(m_pDepthwiseConvolutionLayer); - InitialiseArmComputeTensorData(m_KernelTensor, m_Data.m_Weight->GetConstTensor()); + InitializeArmComputeTensorDataForFloatTypes(*m_KernelTensor, m_Data.m_Weight); - if (optionalBias) + if (m_BiasTensor) { - InitialiseArmComputeTensorData(*optionalBias, m_Data.m_Bias->GetConstTensor()); + InitializeArmComputeTensorDataForFloatTypes(*m_BiasTensor, m_Data.m_Bias); } + + m_pDepthwiseConvolutionLayer->prepare(); + FreeUnusedTensors(); } void NeonDepthwiseConvolutionFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "NeonDepthwiseConvolutionFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonDepthwiseConvolutionFloat32Workload_Execute"); BOOST_ASSERT(m_pDepthwiseConvolutionLayer); m_pDepthwiseConvolutionLayer->run(); } +void NeonDepthwiseConvolutionFloat32Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_KernelTensor); + FreeTensorIfUnused(m_BiasTensor); +} + } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.hpp index f9e295f568..ece9f1877b 100644 --- a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -class NeonDepthwiseConvolutionFloat32Workload : public Float32Workload +class NeonDepthwiseConvolutionFloat32Workload : public FloatWorkload { public: NeonDepthwiseConvolutionFloat32Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor, @@ -20,8 +20,10 @@ public: private: mutable std::unique_ptr m_pDepthwiseConvolutionLayer; - arm_compute::Tensor m_KernelTensor; - arm_compute::Tensor m_BiasTensor; + std::unique_ptr m_KernelTensor; + std::unique_ptr m_BiasTensor; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.cpp index bd034c4f80..45fbcb37ab 100644 --- a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.cpp @@ -20,19 +20,13 @@ NeonDepthwiseConvolutionUint8Workload::NeonDepthwiseConvolutionUint8Workload( { const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo(); - std::string reasonIfUnsupported; - if (!IsNeonDepthwiseConvolution2dDescParamsSupported(&reasonIfUnsupported, m_Data.m_Parameters, weightInfo)) - { - throw UnimplementedException(reasonIfUnsupported); - } + m_KernelTensor = std::make_unique(); + BuildArmComputeTensor(*m_KernelTensor, weightInfo); - BuildArmComputeTensor(m_KernelTensor, weightInfo); - - arm_compute::Tensor* optionalBias = nullptr; if (m_Data.m_Parameters.m_BiasEnabled) { - BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); - optionalBias = &m_BiasTensor; + m_BiasTensor = std::make_unique(); + BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); } arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX, @@ -54,8 +48,8 @@ NeonDepthwiseConvolutionUint8Workload::NeonDepthwiseConvolutionUint8Workload( m_pDepthwiseConvolutionLayer = std::make_unique(); static_cast( m_pDepthwiseConvolutionLayer.get())->configure(&input, - &m_KernelTensor, - optionalBias, + m_KernelTensor.get(), + m_BiasTensor.get(), &output, padStrideInfo); } @@ -64,28 +58,37 @@ NeonDepthwiseConvolutionUint8Workload::NeonDepthwiseConvolutionUint8Workload( m_pDepthwiseConvolutionLayer = std::make_unique(); static_cast( m_pDepthwiseConvolutionLayer.get())->configure(&input, - &m_KernelTensor, - optionalBias, + m_KernelTensor.get(), + m_BiasTensor.get(), &output, padStrideInfo); } BOOST_ASSERT(m_pDepthwiseConvolutionLayer); - InitialiseArmComputeTensorData(m_KernelTensor, m_Data.m_Weight->GetConstTensor()); + InitialiseArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight->GetConstTensor()); - if (optionalBias) + if (m_BiasTensor) { - InitialiseArmComputeTensorData(*optionalBias, m_Data.m_Bias->GetConstTensor()); + InitialiseArmComputeTensorData(*m_BiasTensor, m_Data.m_Bias->GetConstTensor()); } + + m_pDepthwiseConvolutionLayer->prepare(); + FreeUnusedTensors(); } void NeonDepthwiseConvolutionUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "NeonDepthwiseConvolutionUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonDepthwiseConvolutionUint8Workload_Execute"); BOOST_ASSERT(m_pDepthwiseConvolutionLayer); m_pDepthwiseConvolutionLayer->run(); } +void NeonDepthwiseConvolutionUint8Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_KernelTensor); + FreeTensorIfUnused(m_BiasTensor); +} + } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.hpp index 9cf272e9f5..aca0ba5337 100644 --- a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.hpp @@ -20,8 +20,10 @@ public: private: mutable std::unique_ptr m_pDepthwiseConvolutionLayer; - arm_compute::Tensor m_KernelTensor; - arm_compute::Tensor m_BiasTensor; + std::unique_ptr m_KernelTensor; + std::unique_ptr m_BiasTensor; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.cpp index a5eec5cadb..c43cfa9c46 100644 --- a/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.cpp @@ -9,7 +9,7 @@ namespace armnn { NeonFloorFloat32Workload::NeonFloorFloat32Workload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) { m_Data.ValidateInputsOutputs("NeonFloorFloat32Workload", 1, 1); @@ -21,7 +21,7 @@ NeonFloorFloat32Workload::NeonFloorFloat32Workload(const FloorQueueDescriptor& d void NeonFloorFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonFloorFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonFloorFloat32Workload_Execute"); m_Layer.run(); } } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.hpp index f876f1e1bb..56680f1e39 100644 --- a/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -class NeonFloorFloat32Workload : public Float32Workload +class NeonFloorFloat32Workload : public FloatWorkload { public: NeonFloorFloat32Workload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.cpp index e1c4448642..c3af41e20d 100644 --- a/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.cpp @@ -4,16 +4,47 @@ // #include "NeonFullyConnectedFloat32Workload.hpp" -#include "backends/CpuTensorHandle.hpp" + #include "backends/ArmComputeTensorUtils.hpp" +#include "backends/ArmComputeUtils.hpp" +#include "backends/CpuTensorHandle.hpp" namespace armnn { using namespace armcomputetensorutils; +arm_compute::Status NeonFullyConnectedWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, + const FullyConnectedDescriptor& descriptor) +{ + const arm_compute::TensorInfo aclInput = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutput = BuildArmComputeTensorInfo(output); + const arm_compute::TensorInfo aclWeights = BuildArmComputeTensorInfo(weights); + + arm_compute::TensorInfo aclBiases; + arm_compute::TensorInfo *optionalAclBiases = nullptr; + if (descriptor.m_BiasEnabled) + { + aclBiases = BuildArmComputeTensorInfo(biases); + optionalAclBiases = &aclBiases; + } + + const arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo = + ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor); + + + return arm_compute::NEFullyConnectedLayer::validate(&aclInput, + &aclWeights, + optionalAclBiases, + &aclOutput, + fullyConnectedLayerInfo); +} + NeonFullyConnectedFloat32Workload::NeonFullyConnectedFloat32Workload(const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info, std::shared_ptr& memoryManager) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) , m_FullyConnectedLayer(memoryManager) { m_Data.ValidateInputsOutputs("NeonFullyConnectedFloat32Workload", 1, 1); @@ -21,33 +52,45 @@ NeonFullyConnectedFloat32Workload::NeonFullyConnectedFloat32Workload(const Fully arm_compute::ITensor& input = boost::polymorphic_downcast(m_Data.m_Inputs[0])->GetTensor(); arm_compute::ITensor& output = boost::polymorphic_downcast(m_Data.m_Outputs[0])->GetTensor(); - BuildArmComputeTensor(m_WeightsTensor, m_Data.m_Weight->GetTensorInfo()); + m_WeightsTensor = std::make_unique(); + BuildArmComputeTensor(*m_WeightsTensor, m_Data.m_Weight->GetTensorInfo()); - arm_compute::Tensor* optionalBiasTensor = nullptr; if (m_Data.m_Parameters.m_BiasEnabled) { - BuildArmComputeTensor(m_BiasesTensor, m_Data.m_Bias->GetTensorInfo()); - optionalBiasTensor = &m_BiasesTensor; + m_BiasesTensor = std::make_unique(); + BuildArmComputeTensor(*m_BiasesTensor, m_Data.m_Bias->GetTensorInfo()); } // Construct - m_FullyConnectedLayer.configure( - &input, &m_WeightsTensor, optionalBiasTensor, &output, m_Data.m_Parameters.m_TransposeWeightMatrix); + arm_compute::FullyConnectedLayerInfo fc_info; + fc_info.transpose_weights = m_Data.m_Parameters.m_TransposeWeightMatrix; + m_FullyConnectedLayer.configure(&input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output, fc_info); // Allocate - InitialiseArmComputeTensorData(m_WeightsTensor, m_Data.m_Weight->GetConstTensor()); + InitializeArmComputeTensorDataForFloatTypes(*m_WeightsTensor, m_Data.m_Weight); - if (optionalBiasTensor) + if (m_BiasesTensor) { - InitialiseArmComputeTensorData(*optionalBiasTensor, m_Data.m_Bias->GetConstTensor()); + InitializeArmComputeTensorDataForFloatTypes(*m_BiasesTensor, m_Data.m_Bias); } + + // Force Compute Library to perform the necessary copying and reshaping, after which + // delete all the input tensors that will no longer be needed + m_FullyConnectedLayer.prepare(); + FreeUnusedTensors(); } void NeonFullyConnectedFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonFullyConnectedFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonFullyConnectedFloat32Workload_Execute"); m_FullyConnectedLayer.run(); } +void NeonFullyConnectedFloat32Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_WeightsTensor); + FreeTensorIfUnused(m_BiasesTensor); +} + } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp index 9c722dc573..684b5e0753 100644 --- a/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp @@ -14,7 +14,13 @@ namespace armnn { -class NeonFullyConnectedFloat32Workload : public Float32Workload +arm_compute::Status NeonFullyConnectedWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, + const FullyConnectedDescriptor& descriptor); + +class NeonFullyConnectedFloat32Workload : public FloatWorkload { public: NeonFullyConnectedFloat32Workload(const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info, @@ -23,8 +29,11 @@ public: private: mutable arm_compute::NEFullyConnectedLayer m_FullyConnectedLayer; - arm_compute::Tensor m_WeightsTensor; - arm_compute::Tensor m_BiasesTensor; + + std::unique_ptr m_WeightsTensor; + std::unique_ptr m_BiasesTensor; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.cpp index 9f79fa09de..a3ae33f41f 100644 --- a/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.cpp @@ -9,9 +9,21 @@ namespace armnn { +arm_compute::Status NeonL2NormalizationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output) +{ + const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + arm_compute::NormalizationLayerInfo normalizationInfo = + CreateAclNormalizationLayerInfoForL2Normalization(input); + + return arm_compute::NENormalizationLayer::validate(&aclInput, &aclOutput, normalizationInfo); +} + NeonL2NormalizationFloat32Workload::NeonL2NormalizationFloat32Workload(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info, std::shared_ptr& memoryManager) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) , m_Layer(memoryManager) { m_Data.ValidateInputsOutputs("NeonL2NormalizationFloat32Workload", 1, 1); @@ -23,7 +35,7 @@ NeonL2NormalizationFloat32Workload::NeonL2NormalizationFloat32Workload(const L2N void NeonL2NormalizationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonL2NormalizationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonL2NormalizationFloat32Workload_Execute"); m_Layer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp index 2b4a1fef37..c3fcde5a57 100644 --- a/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp @@ -14,7 +14,10 @@ namespace armnn { -class NeonL2NormalizationFloat32Workload : public Float32Workload +arm_compute::Status NeonL2NormalizationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output); + +class NeonL2NormalizationFloat32Workload : public FloatWorkload { public: NeonL2NormalizationFloat32Workload(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info, diff --git a/src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.cpp new file mode 100644 index 0000000000..ba1369e179 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.cpp @@ -0,0 +1,22 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "NeonLstmFloat32Workload.hpp" + +namespace armnn +{ +NeonLstmFloat32Workload::NeonLstmFloat32Workload(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) + : FloatWorkload(descriptor, info) +{ + m_Data.ValidateInputsOutputs("NeonLstmFloat32Workload", 1, 1); +} + +void NeonLstmFloat32Workload::Execute() const +{ + throw armnn::Exception("No implementation of Lstm in the Neon backend!"); +} + +} // namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.hpp new file mode 100644 index 0000000000..78ee1da341 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.hpp @@ -0,0 +1,20 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include + +namespace armnn +{ + +class NeonLstmFloat32Workload : public FloatWorkload +{ +public: + NeonLstmFloat32Workload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info); + virtual void Execute() const override; +}; + +} //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.cpp index 7520e8768e..30dd283620 100644 --- a/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.cpp @@ -10,7 +10,7 @@ namespace armnn void NeonMergerFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "ClMergerFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonMergerFloat32Workload_Execute"); NeonBaseMergerWorkload::Execute(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.hpp index 5c889c2af0..7b8ee9881f 100644 --- a/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.hpp @@ -10,10 +10,10 @@ namespace armnn { -class NeonMergerFloat32Workload : public NeonBaseMergerWorkload +class NeonMergerFloat32Workload : public NeonBaseMergerWorkload { public: - using NeonBaseMergerWorkload::NeonBaseMergerWorkload; + using NeonBaseMergerWorkload::NeonBaseMergerWorkload; virtual void Execute() const override; }; diff --git a/src/armnn/backends/NeonWorkloads/NeonMergerUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonMergerUint8Workload.cpp index 51578e5bff..caccdd443a 100644 --- a/src/armnn/backends/NeonWorkloads/NeonMergerUint8Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonMergerUint8Workload.cpp @@ -10,7 +10,7 @@ namespace armnn void NeonMergerUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "ClMergerUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonMergerUint8Workload_Execute"); NeonBaseMergerWorkload::Execute(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.cpp index 58ce7b74ba..a8a3cd77b4 100644 --- a/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.cpp @@ -9,9 +9,28 @@ namespace armnn { +arm_compute::Status NeonMultiplicationWorkloadValidate(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output) +{ + const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input0); + const arm_compute::TensorInfo aclInput2 = armcomputetensorutils::BuildArmComputeTensorInfo(input1); + const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + // At the time of writing, configure() will fail if a rounding policy other than TO_ZERO is supplied to it, + // when providing a scale of 1.0 for F32 tensors, even though the provided rounding policy appears to be + // ignored for F32 tensors. + return arm_compute::NEPixelWiseMultiplication::validate(&aclInput1, + &aclInput2, + &aclOutput, + 1.0f, + arm_compute::ConvertPolicy::SATURATE, + arm_compute::RoundingPolicy::TO_ZERO); +} + NeonMultiplicationFloat32Workload::NeonMultiplicationFloat32Workload(const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) { m_Data.ValidateInputsOutputs("NeonMultiplicationFloat32Workload", 2, 1); @@ -32,7 +51,7 @@ NeonMultiplicationFloat32Workload::NeonMultiplicationFloat32Workload(const Multi void NeonMultiplicationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonMultiplicationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonMultiplicationFloat32Workload_Execute"); m_PixelWiseMultiplication.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.hpp index ed5ead3700..62e84a2e07 100644 --- a/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.hpp @@ -9,8 +9,11 @@ namespace armnn { +arm_compute::Status NeonMultiplicationWorkloadValidate(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output); -class NeonMultiplicationFloat32Workload : public Float32Workload +class NeonMultiplicationFloat32Workload : public FloatWorkload { public: NeonMultiplicationFloat32Workload(const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.cpp index 0fd0dcc420..20936a2760 100644 --- a/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.cpp @@ -6,13 +6,28 @@ #include "NeonNormalizationFloat32Workload.hpp" #include "backends/NeonLayerSupport.hpp" #include "backends/ArmComputeUtils.hpp" +#include "backends/ArmComputeTensorUtils.hpp" namespace armnn { +arm_compute::Status NeonNormalizationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const NormalizationDescriptor& descriptor) +{ + const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + arm_compute::NormalizationLayerInfo normalizationInfo = + armcomputetensorutils::BuildArmComputeNormalizationLayerInfo(descriptor); + + return arm_compute::NENormalizationLayer::validate(&aclInput, &aclOutput, normalizationInfo); +} + NeonNormalizationFloat32Workload::NeonNormalizationFloat32Workload(const NormalizationQueueDescriptor& descriptor, - const WorkloadInfo& info, std::shared_ptr& memoryManager) - : Float32Workload(descriptor, info) + const WorkloadInfo& info, + std::shared_ptr& memoryManager) + : FloatWorkload(descriptor, info) , m_NormalizationLayer(memoryManager) { m_Data.ValidateInputsOutputs("NeonNormalizationFloat32Workload", 1, 1); @@ -22,7 +37,7 @@ NeonNormalizationFloat32Workload::NeonNormalizationFloat32Workload(const Normali throw UnimplementedException(reasonIfUnsupported); } - // input and output tensors have to have the same dimensionality + // Input and output tensors have to have the same dimensionality. if (info.m_InputTensorInfos[0].GetShape()[1] != info.m_OutputTensorInfos[0].GetShape()[1] || info.m_InputTensorInfos[0].GetShape()[0] != info.m_OutputTensorInfos[0].GetShape()[0] || info.m_InputTensorInfos[0].GetShape()[3] != info.m_OutputTensorInfos[0].GetShape()[3] @@ -48,7 +63,7 @@ NeonNormalizationFloat32Workload::NeonNormalizationFloat32Workload(const Normali void NeonNormalizationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonNormalizationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonNormalizationFloat32Workload_Execute"); m_NormalizationLayer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.hpp index 24b6da8528..8f0823454b 100644 --- a/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.hpp @@ -12,7 +12,11 @@ namespace armnn { -class NeonNormalizationFloat32Workload : public Float32Workload +arm_compute::Status NeonNormalizationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const NormalizationDescriptor& descriptor); + +class NeonNormalizationFloat32Workload : public FloatWorkload { public: NeonNormalizationFloat32Workload(const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info, diff --git a/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.cpp b/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.cpp index e0a0457422..c27797ee4e 100644 --- a/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.cpp @@ -24,10 +24,10 @@ arm_compute::Status NeonPermuteWorkloadValidate(const TensorInfo& input, armcomputetensorutils::BuildArmComputePermutationVector(mappings)); } -template -NeonPermuteWorkload::NeonPermuteWorkload(const PermuteQueueDescriptor& descriptor, +template +NeonPermuteWorkload::NeonPermuteWorkload(const PermuteQueueDescriptor& descriptor, const WorkloadInfo& info) - : TypedWorkload(descriptor, info) + : TypedWorkload(descriptor, info) { using armcomputetensorutils::BuildArmComputePermutationVector; @@ -37,18 +37,18 @@ NeonPermuteWorkload::NeonPermuteWorkload(const PermuteQueueDescriptor& arm_compute::ITensor& output = static_cast(m_Data.m_Outputs[0])->GetTensor(); const armnn::PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings; - // Run the layer + // Run the layer. m_PermuteFunction.configure(&input, &output, BuildArmComputePermutationVector(mappings)); } -template -void NeonPermuteWorkload::Execute() const +template +void NeonPermuteWorkload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, GetName() + "_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON(GetName() + "_Execute"); m_PermuteFunction.run(); } -template class NeonPermuteWorkload; +template class NeonPermuteWorkload; template class NeonPermuteWorkload; } // namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.hpp index 56e8719d6c..06b2dc692b 100644 --- a/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.hpp @@ -7,6 +7,7 @@ #include "backends/Workload.hpp" #include "backends/WorkloadData.hpp" +#include "backends/NeonWorkloadUtils.hpp" #include #include @@ -18,13 +19,13 @@ namespace armnn arm_compute::Status NeonPermuteWorkloadValidate(const TensorInfo& input, const TensorInfo& output, const PermuteDescriptor& descriptor); -template -class NeonPermuteWorkload : public TypedWorkload +template +class NeonPermuteWorkload : public TypedWorkload { public: static const std::string& GetName() { - static const std::string name = std::string("NeonPermute") + GetDataTypeName(DataType) + "Workload"; + static const std::string name = std::string("NeonPermuteWorkload"); return name; } @@ -32,11 +33,11 @@ public: void Execute() const override; private: - using TypedWorkload::m_Data; + using TypedWorkload::m_Data; mutable arm_compute::NEPermute m_PermuteFunction; }; -using NeonPermuteFloat32Workload = NeonPermuteWorkload; +using NeonPermuteFloatWorkload = NeonPermuteWorkload; using NeonPermuteUint8Workload = NeonPermuteWorkload; -} //namespace armnn +} // namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.cpp b/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.cpp index 6d6a492155..3585d36ba3 100644 --- a/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.cpp @@ -25,10 +25,10 @@ arm_compute::Status NeonPooling2dWorkloadValidate(const TensorInfo& input, return arm_compute::NEPoolingLayer::validate(&aclInputInfo, &aclOutputInfo, layerInfo); } -template -NeonPooling2dBaseWorkload::NeonPooling2dBaseWorkload( +template +NeonPooling2dBaseWorkload::NeonPooling2dBaseWorkload( const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info, const std::string& name) - : TypedWorkload(descriptor, info) + : TypedWorkload(descriptor, info) { m_Data.ValidateInputsOutputs(name, 1, 1); @@ -40,7 +40,7 @@ NeonPooling2dBaseWorkload::NeonPooling2dBaseWorkload( m_PoolingLayer.configure(&input, &output, layerInfo); } -template class NeonPooling2dBaseWorkload; +template class NeonPooling2dBaseWorkload; template class NeonPooling2dBaseWorkload; } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.hpp index 9461982f86..2e85e937fa 100644 --- a/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.hpp @@ -14,12 +14,12 @@ arm_compute::Status NeonPooling2dWorkloadValidate(const TensorInfo& input, const TensorInfo& output, const Pooling2dDescriptor& descriptor); -// Base class template providing an implementation of the Pooling2d layer common to all data types -template -class NeonPooling2dBaseWorkload : public TypedWorkload +// Base class template providing an implementation of the Pooling2d layer common to all data types. +template +class NeonPooling2dBaseWorkload : public TypedWorkload { public: - using TypedWorkload::m_Data; + using TypedWorkload::m_Data; NeonPooling2dBaseWorkload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info, const std::string& name); diff --git a/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.cpp index ba2aa20924..cb690c51b8 100644 --- a/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.cpp @@ -12,13 +12,14 @@ namespace armnn NeonPooling2dFloat32Workload::NeonPooling2dFloat32Workload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info) - : NeonPooling2dBaseWorkload(descriptor, info, "NeonPooling2dFloat32Workload") + : NeonPooling2dBaseWorkload(descriptor, info, + "NeonPooling2dFloat32Workload") { } void NeonPooling2dFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonPooling2dFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonPooling2dFloat32Workload_Execute"); m_PoolingLayer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.hpp index 6cfc9cc96f..36c4e7edf1 100644 --- a/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.hpp @@ -11,7 +11,8 @@ namespace armnn { -class NeonPooling2dFloat32Workload : public NeonPooling2dBaseWorkload +class NeonPooling2dFloat32Workload : public NeonPooling2dBaseWorkload { public: NeonPooling2dFloat32Workload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/NeonWorkloads/NeonPooling2dUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonPooling2dUint8Workload.cpp index 0778794081..3e06d08dea 100644 --- a/src/armnn/backends/NeonWorkloads/NeonPooling2dUint8Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonPooling2dUint8Workload.cpp @@ -18,7 +18,7 @@ NeonPooling2dUint8Workload::NeonPooling2dUint8Workload(const Pooling2dQueueDescr void NeonPooling2dUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonPooling2dUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonPooling2dUint8Workload_Execute"); m_PoolingLayer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.cpp index 317d16f6bd..93f6eb8ef5 100644 --- a/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.cpp @@ -12,7 +12,7 @@ namespace armnn NeonReshapeFloat32Workload::NeonReshapeFloat32Workload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) { m_Data.ValidateInputsOutputs("NeonReshapeFloat32Workload", 1, 1); @@ -24,7 +24,7 @@ NeonReshapeFloat32Workload::NeonReshapeFloat32Workload(const ReshapeQueueDescrip void NeonReshapeFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonReshapeFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonReshapeFloat32Workload_Execute"); m_Layer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.hpp index 27f4aea9e7..3e5cca1b9e 100644 --- a/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -class NeonReshapeFloat32Workload : public Float32Workload +class NeonReshapeFloat32Workload : public FloatWorkload { public: NeonReshapeFloat32Workload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/NeonWorkloads/NeonReshapeUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonReshapeUint8Workload.cpp index 06f57c1e0f..b31bdcd3d0 100644 --- a/src/armnn/backends/NeonWorkloads/NeonReshapeUint8Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonReshapeUint8Workload.cpp @@ -24,7 +24,7 @@ NeonReshapeUint8Workload::NeonReshapeUint8Workload(const ReshapeQueueDescriptor& void NeonReshapeUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonReshapeUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonReshapeUint8Workload_Execute"); m_Layer.run(); } } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.cpp b/src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.cpp new file mode 100644 index 0000000000..3efffafe25 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.cpp @@ -0,0 +1,30 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "NeonSoftmaxBaseWorkload.hpp" + +#include "backends/ArmComputeTensorUtils.hpp" + +namespace armnn +{ + +arm_compute::Status NeonSoftmaxWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const SoftmaxDescriptor& descriptor) +{ + // NOTE: We report 4D Softmax as unsupported until full support is added to ACL + if(input.GetShape().GetNumDimensions() >= 4u) + { + return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, "4d softmax is not supported"); + } + + const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + return arm_compute::NESoftmaxLayer::validate(&aclInputInfo, &aclOutputInfo, descriptor.m_Beta); +} + +} //namespace armnn + diff --git a/src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.hpp new file mode 100644 index 0000000000..b9b21fb254 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.hpp @@ -0,0 +1,17 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/NeonWorkloadUtils.hpp" + +namespace armnn +{ + +arm_compute::Status NeonSoftmaxWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const SoftmaxDescriptor& descriptor); + +} //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.cpp index 5e2925ca02..027b508ad5 100644 --- a/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.cpp @@ -10,12 +10,12 @@ namespace armnn NeonSoftmaxFloat32Workload::NeonSoftmaxFloat32Workload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info, std::shared_ptr& memoryManager) - : Float32Workload(descriptor, info) + : FloatWorkload(descriptor, info) , m_SoftmaxLayer(memoryManager) { m_Data.ValidateInputsOutputs("NeonSoftmaxFloat32Workload", 1, 1); - // The ArmCompute softmax layer uses 2D input/output tensors, so flatten the first three dimensions + // The ArmCompute softmax layer uses 2D input/output tensors, so flatten the first three dimensions. arm_compute::ITensor& input = boost::polymorphic_downcast(m_Data.m_Inputs[0])->GetTensor(); arm_compute::ITensor& output = boost::polymorphic_downcast(m_Data.m_Outputs[0])->GetTensor(); @@ -24,7 +24,7 @@ NeonSoftmaxFloat32Workload::NeonSoftmaxFloat32Workload(const SoftmaxQueueDescrip void NeonSoftmaxFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonSoftmaxFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSoftmaxFloat32Workload_Execute"); m_SoftmaxLayer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.hpp index 91d25b47f8..3656a26a3c 100644 --- a/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.hpp @@ -14,7 +14,7 @@ namespace armnn { -class NeonSoftmaxFloat32Workload : public Float32Workload +class NeonSoftmaxFloat32Workload : public FloatWorkload { public: NeonSoftmaxFloat32Workload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info, diff --git a/src/armnn/backends/NeonWorkloads/NeonSoftmaxUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonSoftmaxUint8Workload.cpp index eb4a23c13c..4b0c05b25b 100644 --- a/src/armnn/backends/NeonWorkloads/NeonSoftmaxUint8Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonSoftmaxUint8Workload.cpp @@ -32,7 +32,7 @@ NeonSoftmaxUint8Workload::NeonSoftmaxUint8Workload(const SoftmaxQueueDescriptor& void NeonSoftmaxUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "ClSoftmaxUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSoftmaxUint8Workload_Execute"); m_SoftmaxLayer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.cpp index 13701d2ed3..996fc15adb 100644 --- a/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.cpp @@ -10,7 +10,7 @@ namespace armnn void NeonSplitterFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonSplitterFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSplitterFloat32Workload_Execute"); NeonBaseSplitterWorkload::Execute(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.hpp index 432f5de4eb..9f6dc75499 100644 --- a/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.hpp @@ -10,10 +10,10 @@ namespace armnn { -class NeonSplitterFloat32Workload : public NeonBaseSplitterWorkload +class NeonSplitterFloat32Workload : public NeonBaseSplitterWorkload { public: - using NeonBaseSplitterWorkload::NeonBaseSplitterWorkload; + using NeonBaseSplitterWorkload::NeonBaseSplitterWorkload; virtual void Execute() const override; }; diff --git a/src/armnn/backends/NeonWorkloads/NeonSplitterUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonSplitterUint8Workload.cpp index 90d24d3ffd..0d6328ff7e 100644 --- a/src/armnn/backends/NeonWorkloads/NeonSplitterUint8Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonSplitterUint8Workload.cpp @@ -10,7 +10,7 @@ namespace armnn void NeonSplitterUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonSplitterUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSplitterUint8Workload_Execute"); NeonBaseSplitterWorkload::Execute(); } diff --git a/src/armnn/backends/OutputHandler.cpp b/src/armnn/backends/OutputHandler.cpp index 54afe565a9..ccc62c89ce 100644 --- a/src/armnn/backends/OutputHandler.cpp +++ b/src/armnn/backends/OutputHandler.cpp @@ -30,12 +30,4 @@ void OutputHandler::CollectWorkloadOutputs(WorkloadDataCollector& dataCollector) dataCollector.Push(m_TensorHandle.get(), m_TensorInfo); } -void OutputHandler::AllocateTensors() -{ - if (m_TensorHandle) - { - m_TensorHandle->Allocate(); - } -} - } // namespace armnn diff --git a/src/armnn/backends/OutputHandler.hpp b/src/armnn/backends/OutputHandler.hpp index 9cc87c6095..ed95577cca 100644 --- a/src/armnn/backends/OutputHandler.hpp +++ b/src/armnn/backends/OutputHandler.hpp @@ -31,30 +31,27 @@ class WorkloadDataCollector; class OutputHandler { public: - /// @brief Sets the TensorInfo used by this output handler. - /// @param tensorInfo TensorInfo for the output. + /// @brief - Sets the TensorInfo used by this output handler. + /// @param tensorInfo - TensorInfo for the output. void SetTensorInfo(const TensorInfo& tensorInfo); - /// @brief Create tensor handlers used by the intermediate tensors. Does not allocate memory. - /// @param factory Factory to be used for handler creation. + /// @brief - Creates tensor handlers used by the intermediate tensors. Does not allocate memory. + /// @param factory - Factory to be used for handler creation. void CreateTensorHandles(const IWorkloadFactory& factory); - /// @brief Get the matching TensorInfo for the output - /// @return Reference to the output TensorInfo. + /// @brief - Gets the matching TensorInfo for the output. + /// @return - References to the output TensorInfo. const TensorInfo& GetTensorInfo() const { return m_TensorInfo; } - /// @brief Get the allocated tensor memory. - /// @return Pointer to the tensor memory + /// @brief - Gets the allocated tensor memory. + /// @return - Pointer to the tensor memory. ITensorHandle* GetData() const { return m_TensorHandle.get(); } - /// Fill the outputs for a given queue descriptor + /// Fill the outputs for a given queue descriptor. void CollectWorkloadOutputs(WorkloadDataCollector& dataCollector) const; void SetData(std::unique_ptr data) { m_TensorHandle = std::move(data); } - /// @brief Allocate memory for all the tensors assigned to the handlers - void AllocateTensors(); - /// @brief Returns true if SetTensorInfo() has been called at least once on this. bool IsTensorInfoSet() const { return m_bTensorInfoSet; } private: diff --git a/src/armnn/backends/RefLayerSupport.cpp b/src/armnn/backends/RefLayerSupport.cpp index 0b94656ded..ca4fca6f31 100644 --- a/src/armnn/backends/RefLayerSupport.cpp +++ b/src/armnn/backends/RefLayerSupport.cpp @@ -10,7 +10,6 @@ #include #include - #include "InternalTypes.hpp" using namespace boost; @@ -27,15 +26,18 @@ bool IsSupportedForDataTypeRef(std::string* reasonIfUnsupported, { return IsSupportedForDataTypeGeneric(reasonIfUnsupported, dataType, + &FalseFunc, floatFuncPtr, uint8FuncPtr, std::forward(params)...); } bool IsActivationSupportedRef(const TensorInfo& input, + const TensorInfo& output, const ActivationDescriptor& descriptor, std::string* reasonIfUnsupported) { + ignore_unused(output); ignore_unused(descriptor); return IsSupportedForDataTypeRef(reasonIfUnsupported, input.GetDataType(), @@ -57,6 +59,11 @@ bool IsAdditionSupportedRef(const TensorInfo& input0, } bool IsBatchNormalizationSupportedRef(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, const BatchNormalizationDescriptor& descriptor, std::string* reasonIfUnsupported) { @@ -94,12 +101,16 @@ bool IsConvolution2dSupportedRef(const TensorInfo& input, } bool IsDepthwiseConvolutionSupportedRef(const TensorInfo& input, + const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, + const TensorInfo& biases, std::string* reasonIfUnsupported) { + ignore_unused(output); ignore_unused(descriptor); ignore_unused(weights); + ignore_unused(biases); return IsSupportedForDataTypeRef(reasonIfUnsupported, input.GetDataType(), &TrueFunc<>, @@ -107,10 +118,16 @@ bool IsDepthwiseConvolutionSupportedRef(const TensorInfo& input, } bool IsFullyConnectedSupportedRef(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, const FullyConnectedDescriptor& descriptor, std::string* reasonIfUnsupported) { + ignore_unused(output); ignore_unused(descriptor); + ignore_unused(weights); + ignore_unused(biases); return IsSupportedForDataTypeRef(reasonIfUnsupported, input.GetDataType(), &TrueFunc<>, @@ -127,8 +144,10 @@ bool IsInputSupportedRef(const TensorInfo& input, } bool IsL2NormalizationSupportedRef(const TensorInfo& input, + const TensorInfo& output, std::string* reasonIfUnsupported) { + ignore_unused(output); return IsSupportedForDataTypeRef(reasonIfUnsupported, input.GetDataType(), &TrueFunc<>, @@ -148,9 +167,11 @@ bool IsMergerSupportedRef(const std::vector inputs, bool IsMultiplicationSupportedRef(const TensorInfo& input0, const TensorInfo& input1, + const TensorInfo& output, std::string* reasonIfUnsupported) { ignore_unused(input1); + ignore_unused(output); return IsSupportedForDataTypeRef(reasonIfUnsupported, input0.GetDataType(), &TrueFunc<>, @@ -212,9 +233,11 @@ bool IsResizeBilinearSupportedRef(const TensorInfo& input, } bool IsSoftmaxSupportedRef(const TensorInfo& input, + const TensorInfo& output, const SoftmaxDescriptor& descriptor, std::string* reasonIfUnsupported) { + ignore_unused(output); ignore_unused(descriptor); return IsSupportedForDataTypeRef(reasonIfUnsupported, input.GetDataType(), @@ -264,4 +287,78 @@ bool IsFloorSupportedRef(const TensorInfo& input, &FalseFuncU8<>); } +bool IsLstmSupportedRef(const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor& descriptor, + const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported) +{ + ignore_unused(input); + ignore_unused(outputStateIn); + ignore_unused(cellStateIn); + ignore_unused(scratchBuffer); + ignore_unused(outputStateOut); + ignore_unused(cellStateOut); + ignore_unused(output); + ignore_unused(descriptor); + ignore_unused(inputToForgetWeights); + ignore_unused(inputToCellWeights); + ignore_unused(inputToOutputWeights); + ignore_unused(recurrentToForgetWeights); + ignore_unused(recurrentToCellWeights); + ignore_unused(recurrentToOutputWeights); + ignore_unused(forgetGateBias); + ignore_unused(cellBias); + ignore_unused(outputGateBias); + ignore_unused(inputToInputWeights); + ignore_unused(recurrentToInputWeights); + ignore_unused(cellToInputWeights); + ignore_unused(inputGateBias); + ignore_unused(projectionWeights); + ignore_unused(projectionBias); + ignore_unused(cellToForgetWeights); + ignore_unused(cellToOutputWeights); + return false; +} + +bool IsConvertFp16ToFp32SupportedRef(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported) +{ + return (IsSupportedForDataTypeGeneric(reasonIfUnsupported, + input.GetDataType(), + &TrueFunc<>, + &FalseInputFuncF32<>, + &FalseFuncU8<>) && + IsSupportedForDataTypeGeneric(reasonIfUnsupported, + output.GetDataType(), + &FalseOutputFuncF16<>, + &TrueFunc<>, + &FalseFuncU8<>)); +} + +bool IsConvertFp32ToFp16SupportedRef(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported) +{ + return (IsSupportedForDataTypeGeneric(reasonIfUnsupported, + input.GetDataType(), + &FalseInputFuncF16<>, + &TrueFunc<>, + &FalseFuncU8<>) && + IsSupportedForDataTypeGeneric(reasonIfUnsupported, + output.GetDataType(), + &TrueFunc<>, + &FalseOutputFuncF32<>, + &FalseFuncU8<>)); +} + } diff --git a/src/armnn/backends/RefLayerSupport.hpp b/src/armnn/backends/RefLayerSupport.hpp index 9db1c14596..5e543ac537 100644 --- a/src/armnn/backends/RefLayerSupport.hpp +++ b/src/armnn/backends/RefLayerSupport.hpp @@ -7,11 +7,14 @@ #include #include #include +#include +#include namespace armnn { bool IsActivationSupportedRef(const TensorInfo& input, + const TensorInfo& output, const ActivationDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -21,6 +24,11 @@ bool IsAdditionSupportedRef(const TensorInfo& input0, std::string* reasonIfUnsupported = nullptr); bool IsBatchNormalizationSupportedRef(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, const BatchNormalizationDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -35,11 +43,16 @@ bool IsConvolution2dSupportedRef(const TensorInfo& input, std::string* reasonIfUnsupported = nullptr); bool IsDepthwiseConvolutionSupportedRef(const TensorInfo& input, + const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, + const TensorInfo& biases, std::string* reasonIfUnsupported = nullptr); bool IsFullyConnectedSupportedRef(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, const FullyConnectedDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -47,14 +60,30 @@ bool IsInputSupportedRef(const TensorInfo& input, std::string* reasonIfUnsupported = nullptr); bool IsL2NormalizationSupportedRef(const TensorInfo& input, + const TensorInfo& output, std::string* reasonIfUnsupported = nullptr); +bool IsLstmSupportedRef(const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor& descriptor, + const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported = nullptr); + bool IsMergerSupportedRef(const std::vector inputs, const OriginsDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); bool IsMultiplicationSupportedRef(const TensorInfo& input0, const TensorInfo& input1, + const TensorInfo& output, std::string* reasonIfUnsupported = nullptr); bool IsNormalizationSupportedRef(const TensorInfo& input, @@ -79,6 +108,7 @@ bool IsResizeBilinearSupportedRef(const TensorInfo& input, std::string* reasonIfUnsupported = nullptr); bool IsSoftmaxSupportedRef(const TensorInfo& input, + const TensorInfo& output, const SoftmaxDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -97,4 +127,12 @@ bool IsFloorSupportedRef(const TensorInfo& input, const TensorInfo& output, std::string* reasonIfUnsupported = nullptr); +bool IsConvertFp16ToFp32SupportedRef(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported = nullptr); + +bool IsConvertFp32ToFp16SupportedRef(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported = nullptr); + } diff --git a/src/armnn/backends/RefWorkloadFactory.cpp b/src/armnn/backends/RefWorkloadFactory.cpp index d7d498e89e..9294c5accc 100644 --- a/src/armnn/backends/RefWorkloadFactory.cpp +++ b/src/armnn/backends/RefWorkloadFactory.cpp @@ -18,22 +18,15 @@ template RefWorkloadFactory::MakeWorkload(const QueueDescriptorType& descriptor, const WorkloadInfo& info) const { - if (!IsOperationQueueDescriptor(descriptor) || m_OperationWorkloadsAllowed) - { - return armnn::MakeWorkload(descriptor, info); - } - else - { - return std::unique_ptr(); - } + return armnn::MakeWorkload(descriptor, info); } -RefWorkloadFactory::RefWorkloadFactory(bool operationWorkloadsAllowed) - : m_OperationWorkloadsAllowed(operationWorkloadsAllowed) +RefWorkloadFactory::RefWorkloadFactory() { } -bool RefWorkloadFactory::IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported) +bool RefWorkloadFactory::IsLayerSupported(const Layer& layer, boost::optional dataType, + std::string& outReasonIfUnsupported) { return IWorkloadFactory::IsLayerSupported(Compute::CpuRef, layer, dataType, outReasonIfUnsupported); } @@ -60,7 +53,7 @@ std::unique_ptr RefWorkloadFactory::CreateInput(const InputQueueDescr throw InvalidArgumentException("RefWorkloadFactory::CreateInput: data input and output differ in byte count."); } - return MakeWorkload(descriptor, info); + return MakeWorkload(descriptor, info); } std::unique_ptr RefWorkloadFactory::CreateOutput(const OutputQueueDescriptor& descriptor, @@ -79,7 +72,7 @@ std::unique_ptr RefWorkloadFactory::CreateOutput(const OutputQueueDes throw InvalidArgumentException("RefWorkloadFactory::CreateOutput: data input and output differ in byte count."); } - return MakeWorkload(descriptor, info); + return MakeWorkload(descriptor, info); } std::unique_ptr RefWorkloadFactory::CreateActivation(const ActivationQueueDescriptor& descriptor, @@ -168,25 +161,7 @@ std::unique_ptr RefWorkloadFactory::CreateMemCopy(const MemCop { throw InvalidArgumentException("RefWorkloadFactory: CreateMemCopy() expected an input tensor."); } - // Create a workload that will copy tensor data from the inputs, which can have a number of different formats, - // to CPU tensors. - switch (descriptor.m_Inputs[0]->GetType()) - { -#if ARMCOMPUTECL_ENABLED - case ITensorHandle::CL: - { - return MakeWorkload(descriptor, info); - } -#endif -#if ARMCOMPUTENEON_ENABLED - case ITensorHandle::Neon: - { - return MakeWorkload(descriptor, info); - } -#endif - default: - throw InvalidArgumentException("RefWorkloadFactory: Destination type not supported for MemCopy Workload."); - } + return std::make_unique(descriptor, info); } std::unique_ptr RefWorkloadFactory::CreateResizeBilinear(const ResizeBilinearQueueDescriptor& descriptor, @@ -221,9 +196,29 @@ std::unique_ptr RefWorkloadFactory::CreateReshape(const ReshapeQueueD } std::unique_ptr RefWorkloadFactory::CreateFloor(const FloorQueueDescriptor& descriptor, - const WorkloadInfo& info) const + const WorkloadInfo& info) const { return MakeWorkload(descriptor, info); } +std::unique_ptr RefWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return MakeWorkload(descriptor, info); +} + +std::unique_ptr RefWorkloadFactory::CreateConvertFp16ToFp32( + const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return std::make_unique(descriptor, info); +} + +std::unique_ptr RefWorkloadFactory::CreateConvertFp32ToFp16( + const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return std::make_unique(descriptor, info); +} + } // namespace armnn diff --git a/src/armnn/backends/RefWorkloadFactory.hpp b/src/armnn/backends/RefWorkloadFactory.hpp index 3fab490ad8..ee8639f8ed 100644 --- a/src/armnn/backends/RefWorkloadFactory.hpp +++ b/src/armnn/backends/RefWorkloadFactory.hpp @@ -8,6 +8,7 @@ #include "OutputHandler.hpp" #include +#include namespace armnn { @@ -24,16 +25,17 @@ constexpr bool IsOperationQueueDescriptor(const ConstantQueueDescriptor&) { retu template <> constexpr bool IsOperationQueueDescriptor(const PermuteQueueDescriptor&) { return false; } -// Reference workload factory +// Reference workload factory. class RefWorkloadFactory : public IWorkloadFactory { public: - explicit RefWorkloadFactory(bool operationWorkloadsAllowed = true); - virtual ~RefWorkloadFactory() { }; + explicit RefWorkloadFactory(); + virtual ~RefWorkloadFactory() {} virtual Compute GetCompute() const override { return Compute::CpuRef; } - static bool IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported); + static bool IsLayerSupported(const Layer& layer, boost::optional dataType, + std::string& outReasonIfUnsupported); virtual bool SupportsSubTensors() const override { return false; } @@ -43,7 +45,7 @@ public: { boost::ignore_unused(parent, subTensorShape, subTensorOrigin); return nullptr; - }; + } virtual std::unique_ptr CreateTensorHandle(const TensorInfo& tensorInfo) const override; @@ -113,12 +115,20 @@ public: virtual std::unique_ptr CreateFloor(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info) const override; + virtual std::unique_ptr CreateLstm(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) const override; + + virtual std::unique_ptr CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) const override; + + virtual std::unique_ptr CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) const override; + private: template std::unique_ptr MakeWorkload(const QueueDescriptorType& descriptor, const WorkloadInfo& info) const; - const bool m_OperationWorkloadsAllowed; }; } // namespace armnn diff --git a/src/armnn/backends/RefWorkloads.hpp b/src/armnn/backends/RefWorkloads.hpp index ed4fa840da..1defdbbe82 100644 --- a/src/armnn/backends/RefWorkloads.hpp +++ b/src/armnn/backends/RefWorkloads.hpp @@ -52,3 +52,6 @@ #include "backends/RefWorkloads/Pooling2d.hpp" #include "backends/RefWorkloads/RefFakeQuantizationFloat32Workload.hpp" #include "backends/RefWorkloads/RefPermuteWorkload.hpp" +#include "backends/RefWorkloads/RefLstmFloat32Workload.hpp" +#include "backends/RefWorkloads/RefConvertFp16ToFp32Workload.hpp" +#include "backends/RefWorkloads/RefConvertFp32ToFp16Workload.hpp" diff --git a/src/armnn/backends/RefWorkloads/Activation.cpp b/src/armnn/backends/RefWorkloads/Activation.cpp index ede283cbf9..fdb6091ad7 100644 --- a/src/armnn/backends/RefWorkloads/Activation.cpp +++ b/src/armnn/backends/RefWorkloads/Activation.cpp @@ -24,7 +24,7 @@ void Activation(const float* in, float input = in[i]; float output; - // compute the result of the activation function + // Compute the result of the activation function. switch (function) { case ActivationFunction::Linear: diff --git a/src/armnn/backends/RefWorkloads/Activation.hpp b/src/armnn/backends/RefWorkloads/Activation.hpp index 874441c862..4ee604b462 100644 --- a/src/armnn/backends/RefWorkloads/Activation.hpp +++ b/src/armnn/backends/RefWorkloads/Activation.hpp @@ -9,7 +9,7 @@ namespace armnn { -/// Performs the ActivationFunction elementwise on the inputs to give the outputs +/// Performs the ActivationFunction elementwise on the inputs to give the outputs. void Activation(const float* in, float* out, const TensorInfo& tensorInfo, diff --git a/src/armnn/backends/RefWorkloads/Broadcast.hpp b/src/armnn/backends/RefWorkloads/Broadcast.hpp index b65b57f7a1..bdf03f2a16 100644 --- a/src/armnn/backends/RefWorkloads/Broadcast.hpp +++ b/src/armnn/backends/RefWorkloads/Broadcast.hpp @@ -43,7 +43,7 @@ struct BroadcastLoop } private: - // Struct to hold the dimension data + // Struct to hold the dimension data. struct BroadcastDimensionData { unsigned int m_DimSize; diff --git a/src/armnn/backends/RefWorkloads/ConvImpl.cpp b/src/armnn/backends/RefWorkloads/ConvImpl.cpp index 9ebadacddb..3dcd344101 100644 --- a/src/armnn/backends/RefWorkloads/ConvImpl.cpp +++ b/src/armnn/backends/RefWorkloads/ConvImpl.cpp @@ -46,7 +46,7 @@ int32_t QuantizedMultiplierSmallerThanOne::operator*(int32_t rhs) const int32_t QuantizedMultiplierSmallerThanOne::SaturatingRoundingDoublingHighMul(int32_t a, int32_t b) { - // Check for overflow + // Check for overflow. if (a == b && a == std::numeric_limits::min()) { return std::numeric_limits::max(); diff --git a/src/armnn/backends/RefWorkloads/ConvImpl.hpp b/src/armnn/backends/RefWorkloads/ConvImpl.hpp index 8b66b0b7d2..b7d5d17a8d 100644 --- a/src/armnn/backends/RefWorkloads/ConvImpl.hpp +++ b/src/armnn/backends/RefWorkloads/ConvImpl.hpp @@ -18,7 +18,7 @@ namespace armnn { -/// Performs multiplication of a integer with a multiplier which is less than one, +/// Performs multiplication of an integer with a multiplier which is less than one, /// using quantized integer arithmetic which is consistent with AndroidNN's CPU executor. struct QuantizedMultiplierSmallerThanOne { @@ -28,21 +28,21 @@ public: /// The implementation of this function is adapted from Android NN's QuantizeMultiplierSmallerThanOne(). QuantizedMultiplierSmallerThanOne(float multiplier); - /// The implementation of this function is adapted from Android NN's MultiplyByQuantizedMultiplierSmallerThanOne() + /// The implementation of this function is adapted from Android NN's MultiplyByQuantizedMultiplierSmallerThanOne(). int32_t operator*(int32_t rhs) const; private: - /// The implementation of this function is adapted from gemmlowp's SaturatingRoundingDoublingHighMul() + /// The implementation of this function is adapted from gemmlowp's SaturatingRoundingDoublingHighMul(). static int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b); - /// The implementation of this function is adapted from gemmlowp's RoundingDivideByPOT() + /// The implementation of this function is adapted from gemmlowp's RoundingDivideByPOT(). static int32_t RoundingDivideByPOT(int32_t x, int exponent); int32_t m_Multiplier; int32_t m_RightShift; }; -/// an implementation shared by normal and depthwise convolution +/// An implementation shared by normal and depthwise convolution. template static void ConvImpl(ConvData data, const InputType* inputData, @@ -55,6 +55,7 @@ static void ConvImpl(ConvData data, InputType* outputData, float outputScale, int32_t outputOffset, + const TensorInfo& filterInfo, bool depthwise = false) { if (data.m_Parameters.m_BiasEnabled && !biasData) @@ -64,7 +65,6 @@ static void ConvImpl(ConvData data, const TensorInfo& inputInfo0 = GetTensorInfo(data.m_Inputs[0]); const TensorInfo& outputInfo0 = GetTensorInfo(data.m_Outputs[0]); - const TensorInfo& filterInfo = data.m_Weight->GetTensorInfo(); unsigned int depthMult = depthwise ? filterInfo.GetShape()[0] : 1; unsigned int channelsInput = filterInfo.GetShape()[1]; @@ -84,7 +84,7 @@ static void ConvImpl(ConvData data, unsigned int hStride = data.m_Parameters.m_StrideY; unsigned int xStride = data.m_Parameters.m_StrideX; - // the world's least efficient convolution + // The world's least efficient convolution. for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++) { for (unsigned int cOutput = 0; cOutput < channelsOutput; cOutput++) @@ -93,11 +93,11 @@ static void ConvImpl(ConvData data, { for (unsigned int xOutput = 0; xOutput < widthOutput; xOutput++) { - // this loop goes over each output element + // This loop goes over each output element. AccumulatorType sum = AccumulatorType(); - // for depthwise, each output channel corresponds to exactly one input channel - // for normal, must loop over each input channel + // For depthwise, each output channel corresponds to exactly one input channel. + // For normal, must loop over each input channel. for (unsigned int cInput = 0; cInput < (depthwise ? 1 : channelsInput); cInput++) { unsigned int depthwiseMultiplierIdx = 0; @@ -111,11 +111,11 @@ static void ConvImpl(ConvData data, { for (unsigned int xFilter = 0; xFilter < widthFilter; xFilter++) { - // this loop goes over each input element for each output element + // This loop goes over each input element for each output element. unsigned int filterIndex; - // since dimensionality of kernel depends on depthwiseness, so does index + // Since dimensionality of kernel depends on depthwiseness, so does index. if (depthwise) { filterIndex = depthwiseMultiplierIdx * widthFilter * heightFilter * channelsInput + @@ -138,7 +138,7 @@ static void ConvImpl(ConvData data, AccumulatorType inputValue; - // check if we're in the padding + // Check if we're in the padding. if (yInput < paddingTop || yInput >= heightInput + paddingTop || xInput < paddingLeft || xInput >= widthInput + paddingLeft ) { diff --git a/src/armnn/backends/RefWorkloads/FullyConnected.cpp b/src/armnn/backends/RefWorkloads/FullyConnected.cpp index 8ba11d19c6..1a8263b9a1 100644 --- a/src/armnn/backends/RefWorkloads/FullyConnected.cpp +++ b/src/armnn/backends/RefWorkloads/FullyConnected.cpp @@ -18,11 +18,11 @@ void FullyConnected(const float* inputData, const float* biasData, bool transposeWeights) { - unsigned int N = outputTensorInfo.GetShape()[1]; // Output Vector Size + unsigned int N = outputTensorInfo.GetShape()[1]; // Outputs Vector Size. - BOOST_ASSERT(inputTensorInfo.GetNumDimensions() > 1); // Need some data + BOOST_ASSERT(inputTensorInfo.GetNumDimensions() > 1); // Needs some data. - unsigned int K = 1; // Total number of activations in the input + unsigned int K = 1; // Total number of activations in the input. for (unsigned int i = 1; i < inputTensorInfo.GetNumDimensions(); i++) { K *= inputTensorInfo.GetShape()[i]; diff --git a/src/armnn/backends/RefWorkloads/FullyConnected.hpp b/src/armnn/backends/RefWorkloads/FullyConnected.hpp index 9fa2456110..fa6f54a3ec 100644 --- a/src/armnn/backends/RefWorkloads/FullyConnected.hpp +++ b/src/armnn/backends/RefWorkloads/FullyConnected.hpp @@ -10,7 +10,7 @@ namespace armnn { -/// Performs a matrix multiplication and optionally adds a bias +/// Performs a matrix multiplication and optionally adds a bias. void FullyConnected(const float* inputData, float* outputData, const TensorInfo& inputTensorInfo, diff --git a/src/armnn/backends/RefWorkloads/Merger.hpp b/src/armnn/backends/RefWorkloads/Merger.hpp index 7d1bfab557..1294d05e08 100644 --- a/src/armnn/backends/RefWorkloads/Merger.hpp +++ b/src/armnn/backends/RefWorkloads/Merger.hpp @@ -29,7 +29,7 @@ void Merger(const MergerQueueDescriptor& data) for (unsigned int i=0; i(0, data))[index] = (GetInputTensorData(viewIdx, data))[inIndex]; - //what should we do if input views overlap on the output tensor? - //we could error, take the average, or shm else... - //for now just stop after finding first view (input) that matches. + //What should we do if input views overlap on the output tensor? + //We could error, take the average, or shm else... + //For now just stop after finding first view (input) that matches. break; } } diff --git a/src/armnn/backends/RefWorkloads/Pooling2d.cpp b/src/armnn/backends/RefWorkloads/Pooling2d.cpp index a643e67690..4047f061b3 100644 --- a/src/armnn/backends/RefWorkloads/Pooling2d.cpp +++ b/src/armnn/backends/RefWorkloads/Pooling2d.cpp @@ -164,7 +164,7 @@ void Pooling2d(const float* in, Executor execute = GetExecutor(params.m_PoolType); // Check supported padding methods outside the loop to simplify - // the inner loop + // the inner loop. if (params.m_PaddingMethod != PaddingMethod::Exclude && params.m_PaddingMethod != PaddingMethod::IgnoreValue) { @@ -192,7 +192,7 @@ void Pooling2d(const float* in, float result = defaultInitializer; float poolAreaSize = boost::numeric_cast((hend - hstart) * (wend - wstart)); - // special case: when the pooling kernel is over a padding region and the padding + // Special case: when the pooling kernel is over a padding region and the padding // size is larger or equal to the kernel and the kernel only covers // padding and no real values, then we initialize the result as zero // by convention. This is because we need to choose a value here and @@ -208,8 +208,8 @@ void Pooling2d(const float* in, if (clamped && params.m_PaddingMethod == PaddingMethod::Exclude) { - // when we exclude the padding, it means we calculate with a smaller - // kernel size, so I change the divisor here + // When we exclude the padding, it means we calculate with a smaller + // kernel size, so I changed the divisor here. poolAreaSize = boost::numeric_cast((hend - hstart) * (wend - wstart)); } diff --git a/src/armnn/backends/RefWorkloads/Pooling2d.hpp b/src/armnn/backends/RefWorkloads/Pooling2d.hpp index f88b1a0a4e..cefd022fb3 100644 --- a/src/armnn/backends/RefWorkloads/Pooling2d.hpp +++ b/src/armnn/backends/RefWorkloads/Pooling2d.hpp @@ -11,7 +11,7 @@ namespace armnn { -/// Computes the Pooling2d operation +/// Computes the Pooling2d operation. void Pooling2d(const float* in, float* out, const TensorInfo& inputInfo, diff --git a/src/armnn/backends/RefWorkloads/RefBaseConstantWorkload.hpp b/src/armnn/backends/RefWorkloads/RefBaseConstantWorkload.hpp index 0ede46d9fb..9044fca1c2 100644 --- a/src/armnn/backends/RefWorkloads/RefBaseConstantWorkload.hpp +++ b/src/armnn/backends/RefWorkloads/RefBaseConstantWorkload.hpp @@ -13,7 +13,7 @@ namespace armnn { -// Base class template providing an implementation of the Constant layer common to all data types +// Base class template providing an implementation of the Constant layer common to all data types. template class RefBaseConstantWorkload : public TypedWorkload { diff --git a/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.cpp index c421b0f212..fbc1f07111 100644 --- a/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.cpp +++ b/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.cpp @@ -12,15 +12,22 @@ namespace armnn { +RefBatchNormalizationFloat32Workload::RefBatchNormalizationFloat32Workload( + const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) + : Float32Workload(descriptor, info), + m_Mean(std::make_unique(*(descriptor.m_Mean))), + m_Variance(std::make_unique(*(descriptor.m_Variance))), + m_Beta(std::make_unique(*(descriptor.m_Beta))), + m_Gamma(std::make_unique(*(descriptor.m_Gamma))) {} void RefBatchNormalizationFloat32Workload::Execute() const { ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefBatchNormalizationFloat32Workload_Execute"); - const float* var = m_Data.m_Variance->GetConstTensor(); - const float* mean = m_Data.m_Mean->GetConstTensor(); - const float* gamma = m_Data.m_Gamma->GetConstTensor(); - const float* beta = m_Data.m_Beta->GetConstTensor(); + const float* var = m_Variance->GetConstTensor(); + const float* mean = m_Mean->GetConstTensor(); + const float* gamma = m_Gamma->GetConstTensor(); + const float* beta = m_Beta->GetConstTensor(); auto inputData = GetInputTensorDataFloat(0, m_Data); auto outputData = GetOutputTensorDataFloat(0, m_Data); diff --git a/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.hpp b/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.hpp index cbcdadd749..780c329cc6 100644 --- a/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.hpp +++ b/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.hpp @@ -14,8 +14,15 @@ namespace armnn class RefBatchNormalizationFloat32Workload : public Float32Workload { public: - using Float32Workload::Float32Workload; + explicit RefBatchNormalizationFloat32Workload(const BatchNormalizationQueueDescriptor& descriptor, + const WorkloadInfo& info); virtual void Execute() const override; + +private: + std::unique_ptr m_Mean; + std::unique_ptr m_Variance; + std::unique_ptr m_Beta; + std::unique_ptr m_Gamma; }; } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.cpp b/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.cpp index 8a48523765..4a8e296619 100644 --- a/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.cpp +++ b/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.cpp @@ -14,23 +14,30 @@ namespace armnn { +RefBatchNormalizationUint8Workload::RefBatchNormalizationUint8Workload( + const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) + : Uint8Workload(descriptor, info), + m_Mean(std::make_unique(*(descriptor.m_Mean))), + m_Variance(std::make_unique(*(descriptor.m_Variance))), + m_Beta(std::make_unique(*(descriptor.m_Beta))), + m_Gamma(std::make_unique(*(descriptor.m_Gamma))) {} void RefBatchNormalizationUint8Workload::Execute() const { ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefBatchNormalizationUint8Workload_Execute"); const TensorInfo& inputInfo0 = GetTensorInfo(m_Data.m_Inputs[0]); - const TensorInfo& varInfo = GetTensorInfo(m_Data.m_Variance); - const TensorInfo& meanInfo = GetTensorInfo(m_Data.m_Mean); - const TensorInfo& gammaInfo = GetTensorInfo(m_Data.m_Gamma); - const TensorInfo& betaInfo = GetTensorInfo(m_Data.m_Beta); + const TensorInfo& varInfo = GetTensorInfo(m_Variance.get()); + const TensorInfo& meanInfo = GetTensorInfo(m_Mean.get()); + const TensorInfo& gammaInfo = GetTensorInfo(m_Gamma.get()); + const TensorInfo& betaInfo = GetTensorInfo(m_Beta.get()); const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]); auto input = Dequantize(GetInputTensorDataU8(0, m_Data), inputInfo0); - auto var = Dequantize(m_Data.m_Variance->GetConstTensor(), varInfo); - auto mean = Dequantize(m_Data.m_Mean->GetConstTensor(), meanInfo); - auto gamma = Dequantize(m_Data.m_Gamma->GetConstTensor(), gammaInfo); - auto beta = Dequantize(m_Data.m_Beta->GetConstTensor(), betaInfo); + auto var = Dequantize(m_Variance->GetConstTensor(), varInfo); + auto mean = Dequantize(m_Mean->GetConstTensor(), meanInfo); + auto gamma = Dequantize(m_Gamma->GetConstTensor(), gammaInfo); + auto beta = Dequantize(m_Beta->GetConstTensor(), betaInfo); std::vector results(outputInfo.GetNumElements()); BatchNormImpl(m_Data, var.data(), mean.data(), gamma.data(), beta.data(), results.data(), input.data()); diff --git a/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.hpp b/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.hpp index 57fe995ba5..2c12d28c3f 100644 --- a/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.hpp +++ b/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.hpp @@ -14,8 +14,15 @@ namespace armnn class RefBatchNormalizationUint8Workload : public Uint8Workload { public: - using Uint8Workload::Uint8Workload; + explicit RefBatchNormalizationUint8Workload(const BatchNormalizationQueueDescriptor& descriptor, + const WorkloadInfo& info); virtual void Execute() const override; + +private: + std::unique_ptr m_Mean; + std::unique_ptr m_Variance; + std::unique_ptr m_Beta; + std::unique_ptr m_Gamma; }; } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.cpp b/src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.cpp new file mode 100644 index 0000000000..c4b78014b2 --- /dev/null +++ b/src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.cpp @@ -0,0 +1,25 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "RefConvertFp16ToFp32Workload.hpp" +#include "Half.hpp" +#include "RefWorkloadUtils.hpp" +#include "FloatingPointConverter.hpp" + +namespace armnn +{ + +void RefConvertFp16ToFp32Workload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefConvertFp16ToFp32Workload_Execute"); + + const Half* const input = GetInputTensorDataHalf(0, m_Data); + float* const output = GetOutputTensorDataFloat(0, m_Data); + + unsigned int numElements = GetTensorInfo(m_Data.m_Inputs[0]).GetNumElements(); + armnnUtils::FloatingPointConverter::ConvertFloat16To32(input, numElements, output); +} + +} //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.hpp b/src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.hpp new file mode 100644 index 0000000000..34ae35545b --- /dev/null +++ b/src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.hpp @@ -0,0 +1,21 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/Workload.hpp" +#include "backends/WorkloadData.hpp" + +namespace armnn +{ + +class RefConvertFp16ToFp32Workload : public Float16ToFloat32Workload +{ +public: + using Float16ToFloat32Workload::Float16ToFloat32Workload; + virtual void Execute() const override; +}; + +} //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.cpp b/src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.cpp new file mode 100644 index 0000000000..3c93297302 --- /dev/null +++ b/src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.cpp @@ -0,0 +1,29 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "RefConvertFp32ToFp16Workload.hpp" + +#include "Half.hpp" +#include "FloatingPointConverter.hpp" +#include "RefWorkloadUtils.hpp" + +#include "Profiling.hpp" + +namespace armnn +{ + +void RefConvertFp32ToFp16Workload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefConvertFp32ToFp16Workload_Execute"); + + const float* const input = GetInputTensorDataFloat(0, m_Data); + Half* const output = GetOutputTensorDataHalf(0, m_Data); + + // convert Fp32 input to Fp16 output + unsigned int numElements = GetTensorInfo(m_Data.m_Inputs[0]).GetNumElements(); + armnnUtils::FloatingPointConverter::ConvertFloat32To16(input, numElements, output); +} + +} //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.hpp b/src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.hpp new file mode 100644 index 0000000000..903a50449f --- /dev/null +++ b/src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.hpp @@ -0,0 +1,21 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/Workload.hpp" +#include "backends/WorkloadData.hpp" + +namespace armnn +{ + +class RefConvertFp32ToFp16Workload : public Float32ToFloat16Workload +{ +public: + using Float32ToFloat16Workload::Float32ToFloat16Workload; + virtual void Execute() const override; +}; + +} //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.cpp index 6e4cc69063..4fe823a288 100644 --- a/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.cpp +++ b/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.cpp @@ -12,6 +12,12 @@ namespace armnn { +RefConvolution2dFloat32Workload::RefConvolution2dFloat32Workload( + const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) + : Float32Workload(descriptor, info), + m_Weight(std::make_unique(*(descriptor.m_Weight))), + m_Bias(descriptor.m_Parameters.m_BiasEnabled + ? std::make_unique(*(descriptor.m_Bias)) : nullptr) {} void RefConvolution2dFloat32Workload::Execute() const { @@ -19,12 +25,13 @@ void RefConvolution2dFloat32Workload::Execute() const float* outputData = GetOutputTensorDataFloat(0, m_Data); const float* inputData = GetInputTensorDataFloat(0, m_Data); - const float* weightData = m_Data.m_Weight->template GetConstTensor(); + const float* weightData = m_Weight->template GetConstTensor(); const float* biasData = m_Data.m_Parameters.m_BiasEnabled ? - m_Data.m_Bias->template GetConstTensor() : nullptr; + m_Bias->template GetConstTensor() : nullptr; + const TensorInfo& filterInfo = m_Weight->GetTensorInfo(); ConvImpl( - m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0); + m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0, filterInfo); } } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.hpp b/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.hpp index 514369c262..ecf0082f33 100644 --- a/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.hpp +++ b/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.hpp @@ -14,8 +14,14 @@ namespace armnn class RefConvolution2dFloat32Workload : public Float32Workload { public: - using Float32Workload::Float32Workload; + explicit RefConvolution2dFloat32Workload(const Convolution2dQueueDescriptor& descriptor, + const WorkloadInfo& info); virtual void Execute() const override; + +private: + std::unique_ptr m_Weight; + std::unique_ptr m_Bias; + }; } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.cpp b/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.cpp index f390baa387..19e9c2ed0a 100644 --- a/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.cpp +++ b/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.cpp @@ -12,6 +12,12 @@ namespace armnn { +RefConvolution2dUint8Workload::RefConvolution2dUint8Workload( + const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) + : Uint8Workload(descriptor, info), + m_Weight(std::make_unique(*(descriptor.m_Weight))), + m_Bias(descriptor.m_Parameters.m_BiasEnabled + ? std::make_unique(*(descriptor.m_Bias)) : nullptr) {} void RefConvolution2dUint8Workload::Execute() const { @@ -19,20 +25,21 @@ void RefConvolution2dUint8Workload::Execute() const const uint8_t* inputData = GetInputTensorDataU8(0, m_Data); const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]); - const uint8_t* weightsData = m_Data.m_Weight->template GetConstTensor(); - const TensorInfo& weightsInfo = GetTensorInfo(m_Data.m_Weight); + const uint8_t* weightsData = m_Weight->template GetConstTensor(); + const TensorInfo& weightsInfo = GetTensorInfo(m_Weight.get()); const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ? - m_Data.m_Bias->template GetConstTensor() : + m_Bias->template GetConstTensor() : nullptr; uint8_t* outputData = GetOutputTensorDataU8(0, m_Data); const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]); + const TensorInfo& filterInfo = m_Weight->GetTensorInfo(); ConvImpl( m_Data, inputData, inputInfo.GetQuantizationScale(), inputInfo.GetQuantizationOffset(), weightsData, weightsInfo.GetQuantizationScale(), weightsInfo.GetQuantizationOffset(), biasData, - outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset()); + outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo); } } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.hpp b/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.hpp index 954a206463..733d2052b2 100644 --- a/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.hpp +++ b/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.hpp @@ -14,8 +14,15 @@ namespace armnn class RefConvolution2dUint8Workload : public Uint8Workload { public: - using Uint8Workload::Uint8Workload; + explicit RefConvolution2dUint8Workload(const Convolution2dQueueDescriptor& descriptor, + const WorkloadInfo& info); + virtual void Execute() const override; + +private: + std::unique_ptr m_Weight; + std::unique_ptr m_Bias; + }; } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.cpp index c631fecb66..f3167e299a 100644 --- a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.cpp +++ b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.cpp @@ -12,6 +12,12 @@ namespace armnn { +RefDepthwiseConvolution2dFloat32Workload::RefDepthwiseConvolution2dFloat32Workload( + const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) + : Float32Workload(descriptor, info), + m_Weight(std::make_unique(*(descriptor.m_Weight))), + m_Bias(descriptor.m_Parameters.m_BiasEnabled + ? std::make_unique(*(descriptor.m_Bias)) : nullptr) {} void RefDepthwiseConvolution2dFloat32Workload::Execute() const { @@ -19,12 +25,13 @@ void RefDepthwiseConvolution2dFloat32Workload::Execute() const float* outputData = GetOutputTensorDataFloat(0, m_Data); const float* inputData = GetInputTensorDataFloat(0, m_Data); - const float* weightData = m_Data.m_Weight->template GetConstTensor(); + const float* weightData = m_Weight->template GetConstTensor(); const float* biasData = m_Data.m_Parameters.m_BiasEnabled ? - m_Data.m_Bias->template GetConstTensor() : nullptr; + m_Bias->template GetConstTensor() : nullptr; + const TensorInfo& filterInfo = m_Weight->GetTensorInfo(); ConvImpl - (m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0, true); + (m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0, filterInfo, true); } } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.hpp b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.hpp index 34e6524684..042e7b3c0a 100644 --- a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.hpp +++ b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.hpp @@ -14,8 +14,14 @@ namespace armnn class RefDepthwiseConvolution2dFloat32Workload : public Float32Workload { public: - using Float32Workload::Float32Workload; + explicit RefDepthwiseConvolution2dFloat32Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor, + const WorkloadInfo& info); + virtual void Execute() const override; + +private: + std::unique_ptr m_Weight; + std::unique_ptr m_Bias; }; } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.cpp b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.cpp index 5a8fb13112..fd5ade5559 100644 --- a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.cpp +++ b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.cpp @@ -13,26 +13,34 @@ namespace armnn { +RefDepthwiseConvolution2dUint8Workload::RefDepthwiseConvolution2dUint8Workload( + const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) + : Uint8Workload(descriptor, info), + m_Weight(std::make_unique(*(descriptor.m_Weight))), + m_Bias(descriptor.m_Parameters.m_BiasEnabled + ? std::make_unique(*(descriptor.m_Bias)) : nullptr) {} + void RefDepthwiseConvolution2dUint8Workload::Execute() const { ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefDepthwiseConvolution2dUint8Workload_Execute"); const uint8_t* inputData = GetInputTensorDataU8(0, m_Data); const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]); - const uint8_t* weightsData = m_Data.m_Weight->template GetConstTensor(); - const TensorInfo& weightsInfo = GetTensorInfo(m_Data.m_Weight); + const uint8_t* weightsData = m_Weight->template GetConstTensor(); + const TensorInfo& weightsInfo = GetTensorInfo(m_Weight.get()); const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ? - m_Data.m_Bias->template GetConstTensor() : + m_Bias->template GetConstTensor() : nullptr; uint8_t* outputData = GetOutputTensorDataU8(0, m_Data); const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]); + const TensorInfo& filterInfo = m_Weight->GetTensorInfo(); ConvImpl( m_Data, inputData, inputInfo.GetQuantizationScale(), inputInfo.GetQuantizationOffset(), weightsData, weightsInfo.GetQuantizationScale(), weightsInfo.GetQuantizationOffset(), biasData, - outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), true); + outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo, true); } } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.hpp b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.hpp index bd9945f529..2c8ed2d084 100644 --- a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.hpp +++ b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.hpp @@ -14,8 +14,13 @@ namespace armnn class RefDepthwiseConvolution2dUint8Workload : public Uint8Workload { public: - using Uint8Workload::Uint8Workload; + explicit RefDepthwiseConvolution2dUint8Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor, + const WorkloadInfo& info); virtual void Execute() const override; + +private: + std::unique_ptr m_Weight; + std::unique_ptr m_Bias; }; } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.cpp index 6fe203e5f0..818455e0e9 100644 --- a/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.cpp +++ b/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.cpp @@ -12,6 +12,12 @@ namespace armnn { +RefFullyConnectedFloat32Workload::RefFullyConnectedFloat32Workload( + const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info) + : Float32Workload(descriptor, info), + m_Weight(std::make_unique(*(descriptor.m_Weight))), + m_Bias(descriptor.m_Parameters.m_BiasEnabled + ? std::make_unique(*(descriptor.m_Bias)) : nullptr) {} void RefFullyConnectedFloat32Workload::Execute() const { @@ -22,8 +28,8 @@ void RefFullyConnectedFloat32Workload::Execute() const float* outputData = GetOutputTensorDataFloat(0, m_Data); const float* inputData = GetInputTensorDataFloat(0, m_Data); - const float* weightData = m_Data.m_Weight->GetConstTensor(); - const float* biasData = m_Data.m_Parameters.m_BiasEnabled ? m_Data.m_Bias->GetConstTensor() : nullptr; + const float* weightData = m_Weight->GetConstTensor(); + const float* biasData = m_Data.m_Parameters.m_BiasEnabled ? m_Bias->GetConstTensor() : nullptr; FullyConnected(inputData, outputData, diff --git a/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.hpp b/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.hpp index cb835bd2ce..639d935a16 100644 --- a/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.hpp +++ b/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.hpp @@ -14,8 +14,13 @@ namespace armnn class RefFullyConnectedFloat32Workload : public Float32Workload { public: - using Float32Workload::Float32Workload; + explicit RefFullyConnectedFloat32Workload(const FullyConnectedQueueDescriptor& descriptor, + const WorkloadInfo& info); virtual void Execute() const override; + +private: + std::unique_ptr m_Weight; + std::unique_ptr m_Bias; }; } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.cpp b/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.cpp index 0186d3f5e5..cd653657e1 100644 --- a/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.cpp +++ b/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.cpp @@ -14,6 +14,12 @@ namespace armnn { +RefFullyConnectedUint8Workload::RefFullyConnectedUint8Workload( + const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info) + : Uint8Workload(descriptor, info), + m_Weight(std::make_unique(*(descriptor.m_Weight))), + m_Bias(descriptor.m_Parameters.m_BiasEnabled + ? std::make_unique(*(descriptor.m_Bias)) : nullptr) {} void RefFullyConnectedUint8Workload::Execute() const { @@ -22,18 +28,18 @@ void RefFullyConnectedUint8Workload::Execute() const const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]); const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]); - const uint8_t* weightData = m_Data.m_Weight->GetConstTensor(); + const uint8_t* weightData = m_Weight->GetConstTensor(); auto dequant = Dequantize(GetInputTensorDataU8(0, m_Data), inputInfo); - auto weight = Dequantize(weightData, m_Data.m_Weight->GetTensorInfo()); + auto weight = Dequantize(weightData, m_Weight->GetTensorInfo()); - std::vector results(inputInfo.GetNumElements()); + std::vector results(outputInfo.GetNumElements()); if (m_Data.m_Parameters.m_BiasEnabled) { - const int32_t* biasData = m_Data.m_Bias->GetConstTensor(); - auto bias = Dequantize(biasData, m_Data.m_Bias->GetTensorInfo()); + const int32_t* biasData = m_Bias->GetConstTensor(); + auto bias = Dequantize(biasData, m_Bias->GetTensorInfo()); FullyConnected(dequant.data(), results.data(), diff --git a/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.hpp b/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.hpp index cd14ea85e0..36e5f631ad 100644 --- a/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.hpp +++ b/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.hpp @@ -14,8 +14,13 @@ namespace armnn class RefFullyConnectedUint8Workload : public Uint8Workload { public: - using Uint8Workload::Uint8Workload; + explicit RefFullyConnectedUint8Workload(const FullyConnectedQueueDescriptor& descriptor, + const WorkloadInfo& info); virtual void Execute() const override; + +private: + std::unique_ptr m_Weight; + std::unique_ptr m_Bias; }; } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.cpp new file mode 100644 index 0000000000..bc33638310 --- /dev/null +++ b/src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.cpp @@ -0,0 +1,16 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "RefLstmFloat32Workload.hpp" + +namespace armnn +{ + +void RefLstmFloat32Workload::Execute() const +{ + throw armnn::Exception("No implementation of Lstm in the Ref backend!"); +} + +} //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.hpp b/src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.hpp new file mode 100644 index 0000000000..0acce4d309 --- /dev/null +++ b/src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.hpp @@ -0,0 +1,21 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/Workload.hpp" +#include "backends/WorkloadData.hpp" + +namespace armnn +{ + +class RefLstmFloat32Workload : public Float32Workload +{ +public: + using Float32Workload::Float32Workload; + virtual void Execute() const override; +}; + +} //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefNormalizationFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefNormalizationFloat32Workload.cpp index c743207423..f4dff60ae4 100644 --- a/src/armnn/backends/RefWorkloads/RefNormalizationFloat32Workload.cpp +++ b/src/armnn/backends/RefWorkloads/RefNormalizationFloat32Workload.cpp @@ -17,7 +17,7 @@ namespace armnn { -// Helper function to compute "Within" normalization using Krichevsky 2012: Local Brightness Normalization +// Helper function to compute "Within" normalization using Krichevsky 2012: Local Brightness Normalization. static void NormalizeWithinUingLbr(const float* inputData, float* outputData, const TensorShape& tensorShape, @@ -80,7 +80,7 @@ static void NormalizeWithinUingLbr(const float* inputData, } } -// Helper function to compute "Across" normalization using Krichevsky 2012: Local Brightness Normalization +// Helper function to compute "Across" normalization using Krichevsky 2012: Local Brightness Normalization. void NormalizeAcrossUingLbr(const float* inputData, float* outputData, const TensorShape& tensorShape, diff --git a/src/armnn/backends/RefWorkloads/RefPermuteWorkload.cpp b/src/armnn/backends/RefWorkloads/RefPermuteWorkload.cpp index b2bb8fbf3d..93c883d826 100644 --- a/src/armnn/backends/RefWorkloads/RefPermuteWorkload.cpp +++ b/src/armnn/backends/RefWorkloads/RefPermuteWorkload.cpp @@ -7,6 +7,7 @@ #include "RefWorkloadUtils.hpp" #include +#include "TypeUtils.hpp" namespace armnn { diff --git a/src/armnn/backends/RefWorkloads/RefWorkloadUtils.hpp b/src/armnn/backends/RefWorkloads/RefWorkloadUtils.hpp index 088fe819e5..1df735ea55 100644 --- a/src/armnn/backends/RefWorkloads/RefWorkloadUtils.hpp +++ b/src/armnn/backends/RefWorkloads/RefWorkloadUtils.hpp @@ -9,6 +9,7 @@ #include #include +#include #include @@ -70,6 +71,18 @@ float* GetOutputTensorDataFloat(unsigned int idx, const PayloadType& data) return GetOutputTensorData(idx, data); } +template +const Half* GetInputTensorDataHalf(unsigned int idx, const PayloadType& data) +{ + return GetInputTensorData(idx, data); +} + +template +Half* GetOutputTensorDataHalf(unsigned int idx, const PayloadType& data) +{ + return GetOutputTensorData(idx, data); +} + //////////////////////////////////////////// /// u8 helpers //////////////////////////////////////////// diff --git a/src/armnn/backends/RefWorkloads/ResizeBilinear.cpp b/src/armnn/backends/RefWorkloads/ResizeBilinear.cpp index 7b386ed467..d8bca4be44 100644 --- a/src/armnn/backends/RefWorkloads/ResizeBilinear.cpp +++ b/src/armnn/backends/RefWorkloads/ResizeBilinear.cpp @@ -27,7 +27,7 @@ inline float Lerp(float a, float b, float w) void ResizeBilinear(const float* in, const TensorInfo& inputInfo, float* out, const TensorInfo& outputInfo) { - // We follow the definition of TensorFlow and AndroidNN: The top-left corner of a texel in the output + // We follow the definition of TensorFlow and AndroidNN: the top-left corner of a texel in the output // image is projected into the input image to figure out the interpolants and weights. Note that this // will yield different results than if projecting the centre of output texels. @@ -39,8 +39,8 @@ void ResizeBilinear(const float* in, const TensorInfo& inputInfo, float* out, co const unsigned int outputHeight = outputInfo.GetShape()[2]; const unsigned int outputWidth = outputInfo.GetShape()[3]; - // How much to scale pixel coordinates in the output image to get the corresponding pixel coordinates - // in the input image + // How much to scale pixel coordinates in the output image, to get the corresponding pixel coordinates + // in the input image. const float scaleY = boost::numeric_cast(inputHeight) / boost::numeric_cast(outputHeight); const float scaleX = boost::numeric_cast(inputWidth) / boost::numeric_cast(outputWidth); @@ -53,33 +53,33 @@ void ResizeBilinear(const float* in, const TensorInfo& inputInfo, float* out, co { for (unsigned int y = 0; y < outputHeight; ++y) { - // Corresponding real-valued height coordinate in input image + // Corresponding real-valued height coordinate in input image. const float iy = boost::numeric_cast(y) * scaleY; - // Discrete height coordinate of top-left texel (in the 2x2 texel area used for interpolation) + // Discrete height coordinate of top-left texel (in the 2x2 texel area used for interpolation). const float fiy = floorf(iy); const unsigned int y0 = boost::numeric_cast(fiy); - // Interpolation weight (range [0,1]) + // Interpolation weight (range [0,1]). const float yw = iy - fiy; for (unsigned int x = 0; x < outputWidth; ++x) { - // Real-valued and discrete width coordinates in input image + // Real-valued and discrete width coordinates in input image. const float ix = boost::numeric_cast(x) * scaleX; const float fix = floorf(ix); const unsigned int x0 = boost::numeric_cast(fix); - // Interpolation weight (range [0,1]) + // Interpolation weight (range [0,1]). const float xw = ix - fix; - // Discrete width/height coordinates of texels below and to the right of (x0, y0) + // Discrete width/height coordinates of texels below and to the right of (x0, y0). const unsigned int x1 = std::min(x0 + 1, inputWidth - 1u); const unsigned int y1 = std::min(y0 + 1, inputHeight - 1u); // Interpolation - const float ly0 = Lerp(input.Get(n, c, y0, x0), input.Get(n, c, y0, x1), xw); // lerp along row y0 - const float ly1 = Lerp(input.Get(n, c, y1, x0), input.Get(n, c, y1, x1), xw); // lerp along row y1 + const float ly0 = Lerp(input.Get(n, c, y0, x0), input.Get(n, c, y0, x1), xw); // lerp along row y0. + const float ly1 = Lerp(input.Get(n, c, y1, x0), input.Get(n, c, y1, x1), xw); // lerp along row y1. const float l = Lerp(ly0, ly1, yw); output.Get(n, c, y, x) = l; diff --git a/src/armnn/backends/RefWorkloads/Softmax.cpp b/src/armnn/backends/RefWorkloads/Softmax.cpp index 58840e3076..c9f0bc5e59 100644 --- a/src/armnn/backends/RefWorkloads/Softmax.cpp +++ b/src/armnn/backends/RefWorkloads/Softmax.cpp @@ -11,13 +11,13 @@ namespace armnn { -/// Computes the softmax function on some inputs, into outputs, with a shape given by tensorInfo +/// Computes the softmax function on some inputs, into outputs, with a shape given by tensorInfo. void Softmax(const float* in, float* out, const TensorInfo& tensorInfo, float beta) { unsigned int numChannels = tensorInfo.GetShape()[1]; for (unsigned int n = 0; n < tensorInfo.GetShape()[0]; n++) { - // find maximum channel + // Find maximum channel. float max = in[n * numChannels]; for (unsigned int c = 1; c < numChannels; c++) { @@ -28,7 +28,7 @@ void Softmax(const float* in, float* out, const TensorInfo& tensorInfo, float be } } - // exponentiate all values and sum + // Exponentiate all values and sum. std::vector exponentials(numChannels); float sum = 0.0f; for (unsigned int c = 0; c < numChannels; c++) @@ -38,7 +38,7 @@ void Softmax(const float* in, float* out, const TensorInfo& tensorInfo, float be sum += exponentials[c]; } - // divide exponentials by sum to give outputs + // Divide exponentials by sum to give outputs. for (unsigned int c = 0; c < numChannels; c++) { out[n * numChannels + c] = exponentials[c] / sum; diff --git a/src/armnn/backends/RefWorkloads/Softmax.hpp b/src/armnn/backends/RefWorkloads/Softmax.hpp index c508ab2b82..f75388dc2b 100644 --- a/src/armnn/backends/RefWorkloads/Softmax.hpp +++ b/src/armnn/backends/RefWorkloads/Softmax.hpp @@ -10,7 +10,7 @@ namespace armnn { -/// Computes the softmax function on some inputs, into outputs, with a shape given by tensorInfo +/// Computes the softmax function on some inputs, into outputs, with a shape given by tensorInfo. void Softmax(const float* in, float* out, const TensorInfo& tensorInfo, float beta); } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/Splitter.hpp b/src/armnn/backends/RefWorkloads/Splitter.hpp index bd5da6cfe2..c12d9368bf 100644 --- a/src/armnn/backends/RefWorkloads/Splitter.hpp +++ b/src/armnn/backends/RefWorkloads/Splitter.hpp @@ -31,7 +31,7 @@ void Splitter(const SplitterQueueDescriptor& data) for (unsigned int i = 0; i(viewIdx, data); BOOST_ASSERT(outputData); diff --git a/src/armnn/backends/RefWorkloads/TensorBufferArrayView.hpp b/src/armnn/backends/RefWorkloads/TensorBufferArrayView.hpp index 3994c1f1de..ad0f38e867 100644 --- a/src/armnn/backends/RefWorkloads/TensorBufferArrayView.hpp +++ b/src/armnn/backends/RefWorkloads/TensorBufferArrayView.hpp @@ -10,7 +10,7 @@ namespace armnn { -// Utility class providing access to raw tensor memory based on indices along each dimension +// Utility class providing access to raw tensor memory based on indices along each dimension. template class TensorBufferArrayView { diff --git a/src/armnn/backends/Workload.hpp b/src/armnn/backends/Workload.hpp index dbc7574d0e..5da03bc61d 100644 --- a/src/armnn/backends/Workload.hpp +++ b/src/armnn/backends/Workload.hpp @@ -12,11 +12,11 @@ namespace armnn { -// Workload interface to enqueue a layer computation +// Workload interface to enqueue a layer computation. class IWorkload { public: - virtual ~IWorkload(){}; + virtual ~IWorkload() {} virtual void Execute() const = 0; }; @@ -46,35 +46,102 @@ protected: const QueueDescriptor m_Data; }; -template +// TypedWorkload used +template class TypedWorkload : public BaseWorkload { public: TypedWorkload(const QueueDescriptor& descriptor, const WorkloadInfo& info) : BaseWorkload(descriptor, info) + { + std::vector dataTypes = {DataTypes...}; + armnn::DataType expectedInputType; + + if (!info.m_InputTensorInfos.empty()) + { + expectedInputType = info.m_InputTensorInfos.front().GetDataType(); + + if (std::find(dataTypes.begin(), dataTypes.end(), expectedInputType) == dataTypes.end()) + { + BOOST_ASSERT_MSG(false, "Trying to create workload with incorrect type"); + } + BOOST_ASSERT_MSG(std::all_of(std::next(info.m_InputTensorInfos.begin()), + info.m_InputTensorInfos.end(), + [&](auto it){ + return it.GetDataType() == expectedInputType; + }), + "Trying to create workload with incorrect type"); + } + armnn::DataType expectedOutputType; + + if (!info.m_OutputTensorInfos.empty()) + { + expectedOutputType = info.m_OutputTensorInfos.front().GetDataType(); + + if (!info.m_InputTensorInfos.empty()) + { + if (expectedOutputType != expectedInputType) + { + BOOST_ASSERT_MSG(false, "Trying to create workload with incorrect type"); + } + } + else if (std::find(dataTypes.begin(), dataTypes.end(), expectedOutputType) == dataTypes.end()) + { + BOOST_ASSERT_MSG(false, "Trying to create workload with incorrect type"); + } + BOOST_ASSERT_MSG(std::all_of(std::next(info.m_OutputTensorInfos.begin()), + info.m_OutputTensorInfos.end(), + [&](auto it){ + return it.GetDataType() == expectedOutputType; + }), + "Trying to create workload with incorrect type"); + } + } +}; + +template +class MultiTypedWorkload : public BaseWorkload +{ +public: + + MultiTypedWorkload(const QueueDescriptor& descriptor, const WorkloadInfo& info) + : BaseWorkload(descriptor, info) { BOOST_ASSERT_MSG(std::all_of(info.m_InputTensorInfos.begin(), info.m_InputTensorInfos.end(), [&](auto it){ - return it.GetDataType() == DataType; + return it.GetDataType() == InputDataType; }), "Trying to create workload with incorrect type"); BOOST_ASSERT_MSG(std::all_of(info.m_OutputTensorInfos.begin(), info.m_OutputTensorInfos.end(), [&](auto it){ - return it.GetDataType() == DataType; + return it.GetDataType() == OutputDataType; }), "Trying to create workload with incorrect type"); } - - static constexpr armnn::DataType ms_DataType = DataType; }; +template +using FloatWorkload = TypedWorkload; + template using Float32Workload = TypedWorkload; template using Uint8Workload = TypedWorkload; +template +using Float16ToFloat32Workload = MultiTypedWorkload; + +template +using Float32ToFloat16Workload = MultiTypedWorkload; + } //namespace armnn diff --git a/src/armnn/backends/WorkloadData.cpp b/src/armnn/backends/WorkloadData.cpp index c951fc5d8d..aa763801ce 100644 --- a/src/armnn/backends/WorkloadData.cpp +++ b/src/armnn/backends/WorkloadData.cpp @@ -22,6 +22,8 @@ DataType GetBiasDataType(DataType inputDataType) { switch (inputDataType) { + case DataType::Float16: + return DataType::Float16; case DataType::Float32: return DataType::Float32; case DataType::QuantisedAsymm8: @@ -148,7 +150,7 @@ void ValidateBiasTensorQuantization(const TensorInfo& biasTensor, const TensorIn to_string(biasTensor.GetQuantizationOffset())); } const float expectedScale = inputTensorInfo.GetQuantizationScale() * weightsTensorInfo.GetQuantizationScale(); - if (biasTensor.GetQuantizationScale() != expectedScale) + if (std::abs(biasTensor.GetQuantizationScale() - expectedScale) > 0.000000001f) { // Print the float values with extra precision to see very small differences std::stringstream msg; @@ -338,11 +340,11 @@ void SplitterQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const ". Number of workloadInfo.m_OutputTensorInfos: " + to_string(workloadInfo.m_OutputTensorInfos.size())); } - //the dimensionality of all the windows has to match the dimensionality (not shape) of the input + //The dimensionality of all the windows has to match the dimensionality (not shape) of the input. std::size_t inputDims = workloadInfo.m_InputTensorInfos[0].GetNumDimensions(); for(unsigned int w = 0; w < m_ViewOrigins.size(); ++w ) { - //check that the dimensionality of input is same as the split windows + //Checks that the dimensionality of input is same as the split windows. ViewOrigin const& e = m_ViewOrigins[w]; if (e.m_Origin.size() != inputDims) { @@ -399,11 +401,11 @@ void MergerQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const ". Number of workloadInfo.m_InputTensorInfos: " + to_string(workloadInfo.m_InputTensorInfos.size())); } - //the dimensionality of all the windows has to match the dimensionality (not shape) of the output + //The dimensionality of all the windows has to match the dimensionality (not shape) of the output. std::size_t outputDims = workloadInfo.m_OutputTensorInfos[0].GetNumDimensions(); for(unsigned int w = 0; w < m_ViewOrigins.size(); ++w ) { - //check that the dimensionality of output is same as the split windows + //Checks that the dimensionality of output is same as the split windows. ViewOrigin const& e = m_ViewOrigins[w]; if (e.m_Origin.size() != outputDims) { @@ -415,7 +417,7 @@ void MergerQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const "tensor has " + to_string(outputDims) + " dimensions."); } - //check that the merge windows are within the output tensor + //Checks that the merge windows are within the output tensor. for (unsigned int i = 0; i < e.m_Origin.size(); ++i) { if (e.m_Origin[i] + workloadInfo.m_InputTensorInfos[w].GetShape()[i] @@ -456,7 +458,7 @@ void FullyConnectedQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) c "bias value tensor descriptor is missing."); } - // validate type and quantization values + // Validates type and quantization values. ValidateBiasTensorQuantization(m_Bias->GetTensorInfo(), workloadInfo.m_InputTensorInfos[0], m_Weight->GetTensorInfo(), "FullyConnectedQueueDescriptor"); @@ -578,7 +580,7 @@ void DepthwiseConvolution2dQueueDescriptor::Validate(const WorkloadInfo& workloa ValidatePointer(m_Weight, "DepthwiseConvolution2dQueueDescriptor", "weight"); ValidateTensorNumDimensions(m_Weight->GetTensorInfo(), "DepthwiseConvolution2dQueueDescriptor", 4, "weight"); - //inputChannels * channelMultiplier should be equal to outputChannels + //inputChannels * channelMultiplier should be equal to outputChannels. const unsigned int numWeightChannelMultiplier = m_Weight->GetTensorInfo().GetShape()[0]; const unsigned int numWeightInputChannels = m_Weight->GetTensorInfo().GetShape()[1]; const unsigned int numWeightOutputChannels = workloadInfo.m_OutputTensorInfos[0].GetShape()[1]; @@ -649,7 +651,7 @@ void ResizeBilinearQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) c ValidateTensorNumDimensions(workloadInfo.m_InputTensorInfos[0], "ResizeBilinearQueueDescriptor", 4, "input"); ValidateTensorNumDimensions(workloadInfo.m_OutputTensorInfos[0], "ResizeBilinearQueueDescriptor", 4, "output"); - // Resize bilinear only changes width and height: batch and channel count must match + // Resizes bilinear only changes width and height: batch and channel count must match. { const unsigned int inputBatchSize = workloadInfo.m_InputTensorInfos[0].GetShape()[0]; const unsigned int outputBatchSize = workloadInfo.m_OutputTensorInfos[0].GetShape()[0]; @@ -747,4 +749,53 @@ void FloorQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const } } +void LstmQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const +{ + ValidateTensorNumDimensions(workloadInfo.m_InputTensorInfos[0], "LstmQueueDescriptor", 2, "input"); + ValidateTensorNumDimensions(workloadInfo.m_OutputTensorInfos[0], "LstmQueueDescriptor", 2, "output"); +} + +void ConvertFp32ToFp16QueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const +{ + ValidateSingleInput(workloadInfo, "ConvertFp32ToFp16QueueDescriptor"); + ValidateSingleOutput(workloadInfo, "ConvertFp32ToFp16QueueDescriptor"); + + if (workloadInfo.m_InputTensorInfos[0].GetDataType() != DataType::Float32) + { + throw InvalidArgumentException("ConvertFp32ToFp16QueueDescriptor: Input tensor type must be Float32."); + } + + if (workloadInfo.m_OutputTensorInfos[0].GetDataType() != DataType::Float16) + { + throw InvalidArgumentException("ConvertFp32ToFp16QueueDescriptor: Output tensor type must be Float16."); + } + + ValidateTensorShapesMatch(workloadInfo.m_InputTensorInfos[0], + workloadInfo.m_OutputTensorInfos[0], + "ConvertFp32ToFp16QueueDescriptor", + "input", + "output"); +} + +void ConvertFp16ToFp32QueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const +{ + ValidateSingleInput(workloadInfo, "ConvertFp16ToFp32QueueDescriptor"); + ValidateSingleOutput(workloadInfo, "ConvertFp16ToFp32QueueDescriptor"); + + if (workloadInfo.m_InputTensorInfos[0].GetDataType() != DataType::Float16) + { + throw InvalidArgumentException("ConvertFp16ToFp32QueueDescriptor: Input tensor type must be Float16."); + } + if (workloadInfo.m_OutputTensorInfos[0].GetDataType() != DataType::Float32) + { + throw InvalidArgumentException("ConvertFp16ToFp32QueueDescriptor: Output tensor type must be Float32."); + } + + ValidateTensorShapesMatch(workloadInfo.m_InputTensorInfos[0], + workloadInfo.m_OutputTensorInfos[0], + "ConvertFp16ToFp32QueueDescriptor", + "input", + "output"); +} + } //namespace armnn diff --git a/src/armnn/backends/WorkloadData.hpp b/src/armnn/backends/WorkloadData.hpp index 7f8713582f..db266e6df8 100644 --- a/src/armnn/backends/WorkloadData.hpp +++ b/src/armnn/backends/WorkloadData.hpp @@ -17,7 +17,7 @@ namespace armnn { -//a helper function that returns the bias data type required for given input data type. +//A helper function that returns the bias data type required for given input data type. DataType GetBiasDataType(DataType inputDataType); struct WorkloadInfo; @@ -38,7 +38,7 @@ protected: QueueDescriptor& operator=(QueueDescriptor const&) = default; }; -// Base class for queue descriptors which contain parameters +// Base class for queue descriptors which contain parameters. template struct QueueDescriptorWithParameters : public QueueDescriptor { @@ -59,13 +59,13 @@ struct MemCopyQueueDescriptor : QueueDescriptor using InputQueueDescriptor = MemCopyQueueDescriptor; using OutputQueueDescriptor = MemCopyQueueDescriptor; -// Softmax layer workload data +// Softmax layer workload data. struct SoftmaxQueueDescriptor : QueueDescriptorWithParameters { void Validate(const WorkloadInfo& workloadInfo) const; }; -// Splitter layer workload data +// Splitter layer workload data. struct SplitterQueueDescriptor : QueueDescriptorWithParameters { struct ViewOrigin @@ -73,18 +73,18 @@ struct SplitterQueueDescriptor : QueueDescriptorWithParameters ViewOrigin() {} ViewOrigin(std::vector const& origin) : m_Origin(origin) {} - //view origin (size of the vector is the same as number of dimensions of the view) + //View origin (size of the vector is the same as number of dimensions of the view). std::vector m_Origin; }; - //view defines a tensor that will be carved from the input tensor. - //view origins are stored here, the extents are defined by sizes of the output tensors. + //View defines a tensor that will be carved from the input tensor. + //View origins are stored here, the extents are defined by sizes of the output tensors. std::vector m_ViewOrigins; void Validate(const WorkloadInfo& workloadInfo) const; }; -// Merger layer workload data +// Merger layer workload data. struct MergerQueueDescriptor : QueueDescriptorWithParameters { struct ViewOrigin @@ -92,24 +92,24 @@ struct MergerQueueDescriptor : QueueDescriptorWithParameters ViewOrigin() {} ViewOrigin(const std::vector& origin) : m_Origin(origin) {} - //view origin (size of the vector is the same as number of dimensions of the view) + //View origin (size of the vector is the same as number of dimensions of the view). std::vector m_Origin; }; - //view defines a sub-area of the output tensor that will be filled with the corresponding input tensor. - //view origins are stored here, the extents are defined by sizes of the input tensors. + //View defines a sub-area of the output tensor that will be filled with the corresponding input tensor. + //View origins are stored here, the extents are defined by sizes of the input tensors. std::vector m_ViewOrigins; void Validate(const WorkloadInfo& workloadInfo) const; }; -// Activation layer workload data +// Activation layer workload data. struct ActivationQueueDescriptor : QueueDescriptorWithParameters { void Validate(const WorkloadInfo& workloadInfo) const; }; -// Fully connected layer workload data +// Fully connected layer workload data. struct FullyConnectedQueueDescriptor : QueueDescriptorWithParameters { FullyConnectedQueueDescriptor() @@ -124,19 +124,19 @@ struct FullyConnectedQueueDescriptor : QueueDescriptorWithParameters { void Validate(const WorkloadInfo& workloadInfo) const; }; -// Pooling 2D layer workload data +// Pooling 2D layer workload data. struct Pooling2dQueueDescriptor : QueueDescriptorWithParameters { void Validate(const WorkloadInfo& workloadInfo) const; }; -// Convolution 2D layer workload data +// Convolution 2D layer workload data. struct Convolution2dQueueDescriptor : QueueDescriptorWithParameters { Convolution2dQueueDescriptor() @@ -151,7 +151,7 @@ struct Convolution2dQueueDescriptor : QueueDescriptorWithParameters { DepthwiseConvolution2dQueueDescriptor() @@ -166,25 +166,25 @@ struct DepthwiseConvolution2dQueueDescriptor : QueueDescriptorWithParameters { void Validate(const WorkloadInfo& workloadInfo) const; }; -// Add layer workload data +// Add layer workload data. struct AdditionQueueDescriptor : QueueDescriptor { void Validate(const WorkloadInfo& workloadInfo) const; }; -// Multiplication layer workload data +// Multiplication layer workload data. struct MultiplicationQueueDescriptor : QueueDescriptor { void Validate(const WorkloadInfo& workloadInfo) const; }; -// Batch norm layer workload data +// Batch norm layer workload data. struct BatchNormalizationQueueDescriptor : QueueDescriptorWithParameters { BatchNormalizationQueueDescriptor() @@ -249,4 +249,58 @@ struct FloorQueueDescriptor : QueueDescriptor void Validate(const WorkloadInfo& workloadInfo) const; }; +struct LstmQueueDescriptor : QueueDescriptorWithParameters +{ + LstmQueueDescriptor() + : m_InputToInputWeights(nullptr) + , m_InputToForgetWeights(nullptr) + , m_InputToCellWeights(nullptr) + , m_InputToOutputWeights(nullptr) + , m_RecurrentToInputWeights(nullptr) + , m_RecurrentToForgetWeights(nullptr) + , m_RecurrentToCellWeights(nullptr) + , m_RecurrentToOutputWeights(nullptr) + , m_CellToInputWeights(nullptr) + , m_CellToForgetWeights(nullptr) + , m_CellToOutputWeights(nullptr) + , m_InputGateBias(nullptr) + , m_ForgetGateBias(nullptr) + , m_CellBias(nullptr) + , m_OutputGateBias(nullptr) + , m_ProjectionWeights(nullptr) + , m_ProjectionBias(nullptr) + { + } + + const ConstCpuTensorHandle* m_InputToInputWeights; + const ConstCpuTensorHandle* m_InputToForgetWeights; + const ConstCpuTensorHandle* m_InputToCellWeights; + const ConstCpuTensorHandle* m_InputToOutputWeights; + const ConstCpuTensorHandle* m_RecurrentToInputWeights; + const ConstCpuTensorHandle* m_RecurrentToForgetWeights; + const ConstCpuTensorHandle* m_RecurrentToCellWeights; + const ConstCpuTensorHandle* m_RecurrentToOutputWeights; + const ConstCpuTensorHandle* m_CellToInputWeights; + const ConstCpuTensorHandle* m_CellToForgetWeights; + const ConstCpuTensorHandle* m_CellToOutputWeights; + const ConstCpuTensorHandle* m_InputGateBias; + const ConstCpuTensorHandle* m_ForgetGateBias; + const ConstCpuTensorHandle* m_CellBias; + const ConstCpuTensorHandle* m_OutputGateBias; + const ConstCpuTensorHandle* m_ProjectionWeights; + const ConstCpuTensorHandle* m_ProjectionBias; + + void Validate(const WorkloadInfo& workloadInfo) const; +}; + +struct ConvertFp16ToFp32QueueDescriptor : QueueDescriptor +{ + void Validate(const WorkloadInfo& workloadInfo) const; +}; + +struct ConvertFp32ToFp16QueueDescriptor : QueueDescriptor +{ + void Validate(const WorkloadInfo& workloadInfo) const; +}; + } //namespace armnn diff --git a/src/armnn/backends/WorkloadFactory.cpp b/src/armnn/backends/WorkloadFactory.cpp index 4e94d7701c..1b3f29421a 100644 --- a/src/armnn/backends/WorkloadFactory.cpp +++ b/src/armnn/backends/WorkloadFactory.cpp @@ -20,7 +20,40 @@ namespace armnn { -bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, DataType dataType, +namespace +{ + const TensorInfo OverrideDataType(const TensorInfo& info, boost::optional type) + { + if (type == boost::none) + { + return info; + } + + return TensorInfo(info.GetShape(), type.get(), info.GetQuantizationScale(), info.GetQuantizationOffset()); + } + + boost::optional GetBiasTypeFromWeightsType(boost::optional weightsType) + { + if (weightsType == boost::none) + { + return weightsType; + } + + switch(weightsType.get()) + { + case DataType::Float16: + case DataType::Float32: + return weightsType; + case DataType::QuantisedAsymm8: + return DataType::Signed32; + default: + BOOST_ASSERT_MSG(false, "GetBiasTypeFromWeightsType(): Unsupported data type."); + } + return boost::none; + } +} + +bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, boost::optional dataType, std::string& outReasonIfUnsupported) { constexpr size_t reasonCapacity = 1024; @@ -32,7 +65,13 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat { auto cLayer = boost::polymorphic_downcast(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsActivationSupported(compute, input, cLayer->GetParameters(), reason, reasonCapacity); + const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); + result = IsActivationSupported(compute, + OverrideDataType(input, dataType), + OverrideDataType(output, dataType), + cLayer->GetParameters(), + reason, + reasonCapacity); break; } case LayerType::Addition: @@ -40,30 +79,64 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat const TensorInfo& input0 = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); const TensorInfo& input1 = layer.GetInputSlot(1).GetConnection()->GetTensorInfo(); const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); - result = IsAdditionSupported(compute, input0, input1, output, reason, reasonCapacity); + result = IsAdditionSupported(compute, + OverrideDataType(input0, dataType), + OverrideDataType(input1, dataType), + OverrideDataType(output, dataType), + reason, + reasonCapacity); break; } case LayerType::BatchNormalization: { auto cLayer = boost::polymorphic_downcast(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsBatchNormalizationSupported(compute, input, cLayer->GetParameters(), reason, reasonCapacity); + const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); + const TensorInfo& mean = cLayer->m_Mean->GetTensorInfo(); + const TensorInfo& var = cLayer->m_Variance->GetTensorInfo(); + const TensorInfo& beta = cLayer->m_Beta->GetTensorInfo(); + const TensorInfo& gamma = cLayer->m_Gamma->GetTensorInfo(); + result = IsBatchNormalizationSupported(compute, + OverrideDataType(input, dataType), + OverrideDataType(output, dataType), + OverrideDataType(mean, dataType), + OverrideDataType(var, dataType), + OverrideDataType(beta, dataType), + OverrideDataType(gamma, dataType), + cLayer->GetParameters(), + reason, reasonCapacity); break; } case LayerType::Constant: { const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); - result = IsConstantSupported(compute, output, reason, reasonCapacity); + result = IsConstantSupported(compute, OverrideDataType(output, dataType), reason, reasonCapacity); break; } - case LayerType::Convolution2d: + case LayerType::ConvertFp16ToFp32: { - auto cLayer = boost::polymorphic_downcast(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); + result = IsConvertFp16ToFp32Supported(compute, input, output, reason, reasonCapacity); + break; + } + case LayerType::ConvertFp32ToFp16: + { + const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); + const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); + result = IsConvertFp32ToFp16Supported(compute, input, output, reason, reasonCapacity); + break; + } + case LayerType::Convolution2d: + { + auto cLayer = boost::polymorphic_downcast(&layer); + const TensorInfo input = OverrideDataType(layer.GetInputSlot(0).GetConnection()->GetTensorInfo(), dataType); + const TensorInfo output = OverrideDataType(layer.GetOutputSlot(0).GetTensorInfo(), dataType); BOOST_ASSERT(cLayer->m_Weight.get() != nullptr); - const TensorInfo * biasInfo = nullptr; + TensorInfo biasInfo; + const TensorInfo * biasInfoPtr = nullptr; + static const TensorInfo dummyFloat16Bias(TensorShape({1,1,1,1}), DataType::Float16); static const TensorInfo dummyFloat32Bias(TensorShape({1,1,1,1}), DataType::Float32); static const TensorInfo dummyQA8Bias(TensorShape({1,1,1,1}), DataType::Signed32); @@ -72,21 +145,27 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat if (descriptor.m_BiasEnabled) { BOOST_ASSERT(cLayer->m_Bias.get() != nullptr); - biasInfo = &(cLayer->m_Bias->GetTensorInfo()); + biasInfo = OverrideDataType(cLayer->m_Bias->GetTensorInfo(), GetBiasTypeFromWeightsType(dataType)); + biasInfoPtr = &biasInfo; } else { - // If biases are not enabled I pass a dummy tensorinfo for the validation + // If biases are not enabled pass a dummy tensorinfo for the validation. switch(input.GetDataType()) { + case DataType::Float16: + { + biasInfoPtr = &dummyFloat16Bias; + break; + } case DataType::Float32: { - biasInfo = &dummyFloat32Bias; + biasInfoPtr = &dummyFloat32Bias; break; } case DataType::QuantisedAsymm8: { - biasInfo = &dummyQA8Bias; + biasInfoPtr = &dummyQA8Bias; break; } default: @@ -100,16 +179,16 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat input, output, descriptor, - cLayer->m_Weight->GetTensorInfo(), - *biasInfo, + OverrideDataType(cLayer->m_Weight->GetTensorInfo(), dataType), + *biasInfoPtr, reason, reasonCapacity); break; } case LayerType::MemCopy: { - // MemCopy supported for CpuRef, CpuAcc and GpuAcc backends - // (also treat Undefined as CpuRef to avoid breaking lots of Unit tests) + // MemCopy supported for CpuRef, CpuAcc and GpuAcc backends, + // (also treat Undefined as CpuRef to avoid breaking lots of Unit tests). result = compute == Compute::CpuRef || compute == Compute::Undefined || compute == Compute::CpuAcc || compute == Compute::GpuAcc; strcpy(reason, "Unsupported backend type"); @@ -118,66 +197,314 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat case LayerType::DepthwiseConvolution2d: { auto cLayer = boost::polymorphic_downcast(&layer); - const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsDepthwiseConvolutionSupported(compute, input, cLayer->GetParameters(), - cLayer->m_Weight->GetTensorInfo(), reason, reasonCapacity); + const TensorInfo& input = OverrideDataType(layer.GetInputSlot(0).GetConnection()->GetTensorInfo(), + dataType); + const TensorInfo& output = OverrideDataType(layer.GetOutputSlot(0).GetTensorInfo(), dataType); + BOOST_ASSERT(cLayer->m_Weight.get() != nullptr); + + TensorInfo biasInfo; + const TensorInfo * biasInfoPtr = nullptr; + static const TensorInfo dummyFloat16Bias(TensorShape({1,1,1,1}), DataType::Float16); + static const TensorInfo dummyFloat32Bias(TensorShape({1,1,1,1}), DataType::Float32); + static const TensorInfo dummyQA8Bias(TensorShape({1,1,1,1}), DataType::Signed32); + + const DepthwiseConvolution2dDescriptor& descriptor = cLayer->GetParameters(); + if (descriptor.m_BiasEnabled) + { + BOOST_ASSERT(cLayer->m_Bias.get() != nullptr); + biasInfo = OverrideDataType(cLayer->m_Bias->GetTensorInfo(), GetBiasTypeFromWeightsType(dataType)); + biasInfoPtr = &biasInfo; + } + else + { + // If biases are not enabled pass a dummy tensorinfo for the validation + switch(input.GetDataType()) + { + case DataType::Float16: + { + biasInfoPtr = &dummyFloat16Bias; + break; + } + case DataType::Float32: + { + biasInfoPtr = &dummyFloat32Bias; + break; + } + case DataType::QuantisedAsymm8: + { + biasInfoPtr = &dummyQA8Bias; + break; + } + default: + { + BOOST_ASSERT_MSG(false, "Unexpected bias type"); + } + } + } + + + result = IsDepthwiseConvolutionSupported(compute, + input, + output, + descriptor, + OverrideDataType(cLayer->m_Weight->GetTensorInfo(), dataType), + *biasInfoPtr, + reason, + reasonCapacity); break; } case LayerType::FakeQuantization: { auto cLayer = boost::polymorphic_downcast(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsFakeQuantizationSupported(compute, input, cLayer->GetParameters(), reason, reasonCapacity); + result = IsFakeQuantizationSupported(compute, OverrideDataType(input, dataType), cLayer->GetParameters(), + reason, reasonCapacity); break; } case LayerType::Floor: { const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); - result = IsFloorSupported(compute, input, output, reason, reasonCapacity); + result = IsFloorSupported(compute, OverrideDataType(input, dataType), OverrideDataType(output, dataType), + reason, reasonCapacity); break; } case LayerType::FullyConnected: { auto cLayer = boost::polymorphic_downcast(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsFullyConnectedSupported(compute, input, cLayer->GetParameters(), reason, reasonCapacity); + const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); + BOOST_ASSERT(cLayer->m_Weight.get() != nullptr); + + TensorInfo biasInfo; + const TensorInfo * biasInfoPtr = nullptr; + static const TensorInfo dummyFloat16Bias(TensorShape({1,1,1,1}), DataType::Float16); + static const TensorInfo dummyFloat32Bias(TensorShape({1,1,1,1}), DataType::Float32); + static const TensorInfo dummyQA8Bias(TensorShape({1,1,1,1}), DataType::Signed32); + + const FullyConnectedDescriptor& descriptor = cLayer->GetParameters(); + if (descriptor.m_BiasEnabled) + { + BOOST_ASSERT(cLayer->m_Bias.get() != nullptr); + biasInfo = OverrideDataType(cLayer->m_Bias->GetTensorInfo(), GetBiasTypeFromWeightsType(dataType)); + biasInfoPtr = &biasInfo; + } + else + { + // If biases are not enabled pass a dummy tensorinfo for the validation + switch(input.GetDataType()) + { + case DataType::Float16: + { + biasInfoPtr = &dummyFloat16Bias; + break; + } + case DataType::Float32: + { + biasInfoPtr = &dummyFloat32Bias; + break; + } + case DataType::QuantisedAsymm8: + { + biasInfoPtr = &dummyQA8Bias; + break; + } + default: + { + BOOST_ASSERT_MSG(false, "Unexpected bias type"); + } + } + } + + result = IsFullyConnectedSupported(compute, + OverrideDataType(input, dataType), + OverrideDataType(output, dataType), + OverrideDataType(cLayer->m_Weight->GetTensorInfo(), dataType), + *biasInfoPtr, + descriptor, + reason, + reasonCapacity); break; } case LayerType::Input: { const TensorInfo& input = layer.GetOutputSlot(0).GetTensorInfo(); - result = IsInputSupported(compute, input, reason, reasonCapacity); + result = IsInputSupported(compute, OverrideDataType(input, dataType), reason, reasonCapacity); break; } case LayerType::L2Normalization: { const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsL2NormalizationSupported(compute, input, reason, reasonCapacity); + const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); + result = IsL2NormalizationSupported(compute, OverrideDataType(input, dataType), + OverrideDataType(output, dataType), reason, reasonCapacity); + break; + } + case LayerType::Lstm: + { + auto cLayer = boost::polymorphic_downcast(&layer); + const LstmDescriptor& descriptor = cLayer->GetParameters(); + + // All inputs. + const TensorInfo& input = OverrideDataType(layer.GetInputSlot(0).GetConnection()->GetTensorInfo(), + dataType); + const TensorInfo& outputStateIn = OverrideDataType(layer.GetInputSlot(1).GetConnection()->GetTensorInfo(), + dataType); + const TensorInfo& cellStateIn = OverrideDataType(layer.GetInputSlot(2).GetConnection()->GetTensorInfo(), + dataType); + // All outputs + const TensorInfo& scratchBuffer = OverrideDataType(layer.GetOutputSlot(0).GetTensorInfo(), dataType); + const TensorInfo& outputStateOut = OverrideDataType(layer.GetOutputSlot(1).GetTensorInfo(), dataType); + const TensorInfo& cellStateOut = OverrideDataType(layer.GetOutputSlot(2).GetTensorInfo(), dataType); + const TensorInfo& output = OverrideDataType(layer.GetOutputSlot(3).GetTensorInfo(), dataType); + + // Basic parameters + const TensorInfo& inputToForgetWeights + = OverrideDataType(cLayer->m_BasicParameters.m_InputToForgetWeights->GetTensorInfo(), dataType); + const TensorInfo& inputToCellWeights + = OverrideDataType(cLayer->m_BasicParameters.m_InputToCellWeights->GetTensorInfo(), dataType); + const TensorInfo& inputToOutputWeights + = OverrideDataType(cLayer->m_BasicParameters.m_InputToOutputWeights->GetTensorInfo(), dataType); + const TensorInfo& recurrentToForgetWeights + = OverrideDataType(cLayer->m_BasicParameters.m_RecurrentToForgetWeights->GetTensorInfo(), dataType); + const TensorInfo& recurrentToCellWeights + = OverrideDataType(cLayer->m_BasicParameters.m_RecurrentToCellWeights->GetTensorInfo(), dataType); + const TensorInfo& recurrentToOutputWeights + = OverrideDataType(cLayer->m_BasicParameters.m_RecurrentToOutputWeights->GetTensorInfo(), dataType); + const TensorInfo& forgetGateBias + = OverrideDataType(cLayer->m_BasicParameters.m_ForgetGateBias->GetTensorInfo(), dataType); + const TensorInfo& cellBias + = OverrideDataType(cLayer->m_BasicParameters.m_CellBias->GetTensorInfo(), dataType); + const TensorInfo& outputGateBias + = OverrideDataType(cLayer->m_BasicParameters.m_OutputGateBias->GetTensorInfo(), dataType); + + // Optional parameters + const TensorInfo* inputToInputWeights = nullptr; + const TensorInfo* recurrentToInputWeights = nullptr; + const TensorInfo* cellToInputWeights = nullptr; + const TensorInfo* inputGateBias = nullptr; + const TensorInfo* projectionWeights = nullptr; + const TensorInfo* projectionBias = nullptr; + const TensorInfo* cellToForgetWeights = nullptr; + const TensorInfo* cellToOutputWeights = nullptr; + + TensorInfo optInputToInputWeights; + TensorInfo optRecurrentToInputWeights; + TensorInfo optCellToInputWeights; + TensorInfo optInputGateBias; + TensorInfo optProjectionWeights; + TensorInfo optProjectionBias; + TensorInfo optCellToForgetWeights; + TensorInfo optCellToOutputWeights; + + if(!descriptor.m_CifgEnabled) + { + optInputToInputWeights = + OverrideDataType(cLayer->m_CifgParameters.m_InputToInputWeights->GetTensorInfo(), dataType); + inputToInputWeights = &optInputToInputWeights; + + optRecurrentToInputWeights = + OverrideDataType(cLayer->m_CifgParameters.m_RecurrentToInputWeights->GetTensorInfo(), dataType); + recurrentToInputWeights = &optRecurrentToInputWeights; + if (cLayer->m_CifgParameters.m_CellToInputWeights != nullptr) + { + optCellToInputWeights = + OverrideDataType(cLayer->m_CifgParameters.m_CellToInputWeights->GetTensorInfo(), dataType); + cellToInputWeights = &optCellToInputWeights; + } + optInputGateBias = + OverrideDataType(cLayer->m_CifgParameters.m_InputGateBias->GetTensorInfo(), dataType); + inputGateBias = &optInputGateBias; + } + + if(descriptor.m_ProjectionEnabled) + { + optProjectionWeights = + OverrideDataType(cLayer->m_ProjectionParameters.m_ProjectionWeights->GetTensorInfo(), dataType); + projectionWeights = &optProjectionWeights; + if (cLayer->m_ProjectionParameters.m_ProjectionBias != nullptr) + { + optProjectionBias = + OverrideDataType(cLayer->m_ProjectionParameters.m_ProjectionBias->GetTensorInfo(), dataType); + projectionBias = &optProjectionBias; + } + } + + if(descriptor.m_PeepholeEnabled) + { + optCellToForgetWeights = + OverrideDataType(cLayer->m_PeepholeParameters.m_CellToForgetWeights->GetTensorInfo(), dataType); + cellToForgetWeights = &optCellToForgetWeights; + optCellToOutputWeights = + OverrideDataType(cLayer->m_PeepholeParameters.m_CellToOutputWeights->GetTensorInfo(), dataType); + cellToOutputWeights = &optCellToOutputWeights; + } + + result = IsLstmSupported(compute, + input, + outputStateIn, + cellStateIn, + scratchBuffer, + outputStateOut, + cellStateOut, + output, + descriptor, + inputToForgetWeights, + inputToCellWeights, + inputToOutputWeights, + recurrentToForgetWeights, + recurrentToCellWeights, + recurrentToOutputWeights, + forgetGateBias, + cellBias, + outputGateBias, + inputToInputWeights, + recurrentToInputWeights, + cellToInputWeights, + inputGateBias, + projectionWeights, + projectionBias, + cellToForgetWeights, + cellToOutputWeights, + reason, + reasonCapacity); break; } case LayerType::Merger: { auto cLayer = boost::polymorphic_downcast(&layer); - // Get vector of all inputs - auto getTensorInfo = [](const InputSlot& slot) + // Get vector of all inputs. + auto getTensorInfo = [&dataType](const InputSlot& slot) { - return &slot.GetConnectedOutputSlot()->GetTensorInfo(); + return OverrideDataType(slot.GetConnectedOutputSlot()->GetTensorInfo(), dataType); }; - auto begin = boost::make_transform_iterator(layer.GetInputSlots().begin(), getTensorInfo); - auto end = boost::make_transform_iterator(layer.GetInputSlots().end(), getTensorInfo); + auto beginI = boost::make_transform_iterator(layer.GetInputSlots().begin(), getTensorInfo); + auto endI = boost::make_transform_iterator(layer.GetInputSlots().end(), getTensorInfo); + std::vector inputs(beginI, endI); - std::vector inputs(begin, end); + auto getTensorInfoPtr = [](const TensorInfo& info) + { + return &info; + }; + auto beginPtr = boost::make_transform_iterator(inputs.begin(), getTensorInfoPtr); + auto endPtr = boost::make_transform_iterator(inputs.end(), getTensorInfoPtr); + std::vector inputPtrs(beginPtr, endPtr); - result = IsMergerSupported(compute, inputs, cLayer->GetParameters(), reason, reasonCapacity); + result = IsMergerSupported(compute, inputPtrs, cLayer->GetParameters(), reason, reasonCapacity); break; } case LayerType::Multiplication: { const TensorInfo& input0 = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); const TensorInfo& input1 = layer.GetInputSlot(1).GetConnection()->GetTensorInfo(); - result = IsMultiplicationSupported(compute, input0, input1, reason, reasonCapacity); + const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); + result = IsMultiplicationSupported(compute, + OverrideDataType(input0, dataType), + OverrideDataType(input1, dataType), + OverrideDataType(output, dataType), + reason, + reasonCapacity); break; } case LayerType::Normalization: @@ -185,13 +512,15 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat auto cLayer = boost::polymorphic_downcast(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); - result = IsNormalizationSupported(compute, input, output, cLayer->GetParameters(), reason, reasonCapacity); + result = IsNormalizationSupported(compute, OverrideDataType(input, dataType), + OverrideDataType(output, dataType), cLayer->GetParameters(), reason, + reasonCapacity); break; } case LayerType::Output: { const TensorInfo& output = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsOutputSupported(compute, output, reason, reasonCapacity); + result = IsOutputSupported(compute, OverrideDataType(output, dataType), reason, reasonCapacity); break; } case LayerType::Permute: @@ -199,7 +528,8 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat auto cLayer = boost::polymorphic_downcast(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); - result = IsPermuteSupported(compute, input, output, cLayer->GetParameters(), reason, reasonCapacity); + result = IsPermuteSupported(compute, OverrideDataType(input, dataType), OverrideDataType(output, dataType), + cLayer->GetParameters(), reason, reasonCapacity); break; } case LayerType::Pooling2d: @@ -207,33 +537,38 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat auto cLayer = boost::polymorphic_downcast(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); - result = IsPooling2dSupported(compute, input, output, cLayer->GetParameters(), reason, reasonCapacity); + result = IsPooling2dSupported(compute, OverrideDataType(input, dataType), + OverrideDataType(output, dataType), cLayer->GetParameters(), reason, + reasonCapacity); break; } case LayerType::Reshape: { const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsReshapeSupported(compute, input, reason, reasonCapacity); + result = IsReshapeSupported(compute, OverrideDataType(input, dataType), reason, reasonCapacity); break; } case LayerType::ResizeBilinear: { const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsResizeBilinearSupported(compute, input, reason, reasonCapacity); + result = IsResizeBilinearSupported(compute, OverrideDataType(input, dataType), reason, reasonCapacity); break; } case LayerType::Softmax: { auto cLayer = boost::polymorphic_downcast(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsSoftmaxSupported(compute, input, cLayer->GetParameters(), reason, reasonCapacity); + const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); + result = IsSoftmaxSupported(compute, OverrideDataType(input, dataType), OverrideDataType(output, dataType), + cLayer->GetParameters(), reason, reasonCapacity); break; } case LayerType::Splitter: { auto cLayer = boost::polymorphic_downcast(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsSplitterSupported(compute, input, cLayer->GetParameters(), reason, reasonCapacity); + result = IsSplitterSupported(compute, OverrideDataType(input, dataType), cLayer->GetParameters(), reason, + reasonCapacity); break; } default: @@ -248,7 +583,8 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat return result; } -bool IWorkloadFactory::IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported) +bool IWorkloadFactory::IsLayerSupported(const Layer& layer, boost::optional dataType, + std::string& outReasonIfUnsupported) { return IsLayerSupported(layer.GetComputeDevice(), layer, dataType, outReasonIfUnsupported); } diff --git a/src/armnn/backends/WorkloadFactory.hpp b/src/armnn/backends/WorkloadFactory.hpp index 5791c1b46f..c211a290b3 100644 --- a/src/armnn/backends/WorkloadFactory.hpp +++ b/src/armnn/backends/WorkloadFactory.hpp @@ -8,13 +8,14 @@ #include #include "armnn/TensorFwd.hpp" #include "OutputHandler.hpp" +#include namespace armnn { class Layer; -// Workload factory interface for compute backends +// Workload factory interface for compute backends. class IWorkloadFactory { public: @@ -25,9 +26,16 @@ public: /// Informs the memory manager that the network is finalized and ready for execution. virtual void Finalize() { } - static bool IsLayerSupported(Compute compute, const Layer& layer, DataType dataType, + /// Inform the memory manager to release the memory + virtual void Release() { } + + /// Inform the memory manager to acquire memory + virtual void Acquire() { } + + static bool IsLayerSupported(Compute compute, const Layer& layer, boost::optional dataType, + std::string& outReasonIfUnsupported); + static bool IsLayerSupported(const Layer& layer, boost::optional dataType, std::string& outReasonIfUnsupported); - static bool IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported); virtual bool SupportsSubTensors() const = 0; @@ -103,6 +111,15 @@ public: virtual std::unique_ptr CreateFloor(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info) const = 0; + + virtual std::unique_ptr CreateLstm(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) const = 0; + + virtual std::unique_ptr CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) const = 0; + + virtual std::unique_ptr CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) const = 0; }; } //namespace armnn diff --git a/src/armnn/backends/WorkloadUtils.hpp b/src/armnn/backends/WorkloadUtils.hpp new file mode 100644 index 0000000000..f21c78558e --- /dev/null +++ b/src/armnn/backends/WorkloadUtils.hpp @@ -0,0 +1,139 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "armnn/Tensor.hpp" +#include "ITensorHandle.hpp" + +#include + +namespace armnn +{ +namespace +{ +template +void AssignValues(unsigned int num, unsigned int& idx, const ArrayType& array, Arg& arg) +{ + if (idx >= num) + { + return; + } + + arg = array[(num - 1) - idx]; + idx++; +}; + +template +void AssignValues(unsigned int num, unsigned int idx, const ArrayType& array, T& assignee, Args& ... args) +{ + AssignValues(num, idx, array, assignee); + + AssignValues(num, idx, array, args...); +} +} // namespace + +template +void CopyTensorContentsGeneric(const ITensorHandle* srcTensor, ITensorHandle* dstTensor, CopyFunc copy) +{ + static_assert(MaxNumOfTensorDimensions == 4, "Please update CopyTensorContents"); + + TensorShape srcStrides = srcTensor->GetStrides(); + const TensorShape& srcShape = srcTensor->GetShape(); + TensorShape dstStrides = dstTensor->GetStrides(); + const TensorShape& dstShape = dstTensor->GetShape(); + + size_t srcBatches = 1; + size_t srcChannels = 1; + size_t srcHeight = 1; + size_t srcWidth = 1; + AssignValues(srcShape.GetNumDimensions(),0, srcShape, + srcWidth, + srcHeight, + srcChannels, + srcBatches); + + size_t srcBatchStride = 0; + size_t srcChannelStride = 0; + size_t srcHeightStride = 0; + size_t srcWidthStride = 0; + AssignValues(srcStrides.GetNumDimensions(),0, srcStrides, + srcWidthStride, + srcHeightStride, + srcChannelStride, + srcBatchStride); + + size_t dstBatches = 1; + size_t dstChannels = 1; + size_t dstHeight = 1; + size_t dstWidth = 1; + AssignValues(dstShape.GetNumDimensions(),0, dstShape, + dstWidth, + dstHeight, + dstChannels, + dstBatches); + + size_t dstBatchStride = 0; + size_t dstChannelStride = 0; + size_t dstHeightStride = 0; + size_t dstWidthStride = 0; + AssignValues(dstStrides.GetNumDimensions(),0, dstStrides, + dstWidthStride, + dstHeightStride, + dstChannelStride, + dstBatchStride); + + auto srcData = static_cast(srcTensor->Map()); + auto dstData = static_cast(dstTensor->Map()); + + size_t copyLength = std::min(srcWidth*srcWidthStride, dstWidth*dstWidthStride); + size_t copyHeight = std::min(srcHeight, dstHeight); + size_t copyChannels = std::min(srcChannels, dstChannels); + size_t copyBatches = std::min(srcBatches, dstBatches); + + for(unsigned int b=0; b < copyBatches; ++b) + { + auto srcPtrBatch = srcData; + auto dstPtrBatch = dstData; + for (unsigned int c=0; c< copyChannels; ++c) + { + auto srcPtrChannel = srcData; + auto dstPtrChannel = dstData; + for (unsigned int h=0; h < copyHeight; ++h) + { + copy(dstData, srcData, copyLength); + dstData += dstHeightStride; + srcData += srcHeightStride; + } + dstData += (static_cast(dstChannelStride) - (dstData - dstPtrChannel)); + srcData += (static_cast(srcChannelStride) - (srcData - srcPtrChannel)); + } + dstData += (static_cast(dstBatchStride)-(dstData - dstPtrBatch)); + srcData += (static_cast(srcBatchStride)-(srcData - srcPtrBatch)); + } + + srcTensor->Unmap(); + dstTensor->Unmap(); +} + +template +void GatherTensorHandlePairs(const DescriptorType& descriptor, + std::vector>& tensorHandlePairs) +{ + const unsigned int numInputs = static_cast(descriptor.m_Inputs.size()); + tensorHandlePairs.reserve(numInputs); + + for (unsigned int i = 0; i < numInputs; ++i) + { + SrcTensorHandleType* const srcTensorHandle = boost::polymorphic_downcast( + descriptor.m_Inputs[i]); + DstTensorHandleType* const dstTensorHandle = boost::polymorphic_downcast( + descriptor.m_Outputs[i]); + + tensorHandlePairs.emplace_back(srcTensorHandle, dstTensorHandle); + } +} + +} //namespace armnn \ No newline at end of file diff --git a/src/armnn/backends/test/ActivationFixture.hpp b/src/armnn/backends/test/ActivationFixture.hpp index a67a110354..69f3c8be05 100644 --- a/src/armnn/backends/test/ActivationFixture.hpp +++ b/src/armnn/backends/test/ActivationFixture.hpp @@ -41,7 +41,7 @@ struct ActivationFixture armnn::TensorInfo inputTensorInfo; armnn::TensorInfo outputTensorInfo; - // parameters used by some of the activation functions + // Parameters used by some of the activation functions. float a = 0.234f; float b = -12.345f; }; diff --git a/src/armnn/backends/test/ActivationTestImpl.hpp b/src/armnn/backends/test/ActivationTestImpl.hpp index 255a00ef0b..e699b2289b 100644 --- a/src/armnn/backends/test/ActivationTestImpl.hpp +++ b/src/armnn/backends/test/ActivationTestImpl.hpp @@ -53,7 +53,7 @@ LayerTestResult BoundedReLuTestCommon(armnn::IWorkloadFactory& workloadFac std::unique_ptr inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); std::unique_ptr outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); - // Setup bounded ReLu + // Setup bounded ReLu. armnn::ActivationQueueDescriptor descriptor; armnn::WorkloadInfo workloadInfo; AddInputToWorkload(descriptor, workloadInfo, inputTensorInfo, inputHandle.get()); @@ -94,7 +94,7 @@ LayerTestResult BoundedReLuUpperAndLowerBoundTest(armnn::IWorkloadFact 0.999f, 1.2f, 0.89f, 6.1f, }; - // Calculated manually + // Calculated manually. std::vector output = std::vector{ -1.0f, 0.1f, 0.5f, 1.0f, 0.786f, 0.9875f, -1.0f, 0.384f, @@ -122,7 +122,7 @@ LayerTestResult BoundedReLuUpperBoundOnlyTest(armnn::IWorkloadFactory& 0.999f, 1.2f, 0.89f, 6.1f, }; - // Calculated manually + // Calculated manually. std::vector output = std::vector{ 0.0f, 0.1f, 0.5f, 6.0f, 0.786f, 5.9875f, 0.0f, 0.384f, @@ -147,7 +147,7 @@ LayerTestResult BoundedReLuUint8UpperBoundOnlyTest(armnn::IWorkloadF 251, 8, 92 }; - // Calculated manually + // Calculated manually. std::vector output = std::vector{ 0, 122, 0, 255, 0, 58 @@ -176,7 +176,7 @@ LayerTestResult BoundedReLuUint8UpperAndLowerBoundTest(armnn::IWorkl 251, 8, 92 }; - // Calculated manually + // Calculated manually. std::vector output = std::vector{ 51, 192, 32, 192, 32, 92 @@ -186,7 +186,7 @@ LayerTestResult BoundedReLuUint8UpperAndLowerBoundTest(armnn::IWorkl float inputScale = 0.0125f; return BoundedReLuTestCommon(workloadFactory, 1.0f, -1.0f, - inputScale, inputOffset, inputScale, inputOffset, // input/output scale & offset same + inputScale, inputOffset, inputScale, inputOffset, // Input/output scale & offset same. input, output, inputWidth, inputHeight, inputChannels, inputBatchSize); } @@ -229,13 +229,14 @@ boost::multi_array BoundedReLuRandomInputTest(armnn::IWorkloadFactory& boost::multi_array output(GetTensorShapeAsArray<4>(outputTensorInfo)); - // min/max random values passed to MakeRandomTensor are purposely outside of the ReLu range [lowerBound, upperBound] + // Min/max random values passed to MakeRandomTensor are purposely outside of the ReLu + // range [lowerBound, upperBound]. auto input = MakeRandomTensor(inputTensorInfo, 4605828, lowerBound - 5.0f, upperBound * 2.0f); std::unique_ptr inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); std::unique_ptr outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); - // Setup bounded ReLu + // Set up bounded ReLu. armnn::ActivationQueueDescriptor descriptor; armnn::WorkloadInfo workloadInfo; AddInputToWorkload(descriptor, workloadInfo, inputTensorInfo, inputHandle.get()); @@ -308,7 +309,7 @@ LayerTestResult ConstantLinearActivationTestCommon(armnn::IWorkloadFactory& std::unique_ptr inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); std::unique_ptr outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); - // Do linear activation that should leave tensor unchanged + // Do linear activation that should leave the tensor unchanged. armnn::ActivationQueueDescriptor data; armnn::WorkloadInfo info; AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); @@ -329,7 +330,7 @@ LayerTestResult ConstantLinearActivationTestCommon(armnn::IWorkloadFactory& CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get()); - // Ensure output equals input + // Ensure output equals input. ret.outputExpected = input; return ret; @@ -386,7 +387,7 @@ LayerTestResult SimpleActivationTest(armnn::IWorkloadFactory& workloadFact std::unique_ptr inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); std::unique_ptr outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); - // Setup bounded ReLu + // Setup bounded ReLu. armnn::ActivationQueueDescriptor descriptor; armnn::WorkloadInfo workloadInfo; AddInputToWorkload(descriptor, workloadInfo, inputTensorInfo, inputHandle.get()); @@ -407,7 +408,7 @@ LayerTestResult SimpleActivationTest(armnn::IWorkloadFactory& workloadFact CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get()); - // Calculated manually + // Calculated manually. result.outputExpected = MakeTensor(outputTensorInfo, QuantizedVector(qScale, qOffset, outputExpectedData)); return result; @@ -423,7 +424,7 @@ LayerTestResult SimpleSigmoidTestCommon(armnn::IWorkloadFactory& workloadF 1.0f, 2.0f, 3.0f, 4.0f }; - // Calculate output values for input + // Calculate output values for input. auto f = [](float value) { return 1.0f / (1.0f + std::exp(-value)); diff --git a/src/armnn/backends/test/ArmComputeCl.cpp b/src/armnn/backends/test/ArmComputeCl.cpp index ae42d03ee3..d0cb7243c3 100644 --- a/src/armnn/backends/test/ArmComputeCl.cpp +++ b/src/armnn/backends/test/ArmComputeCl.cpp @@ -3,7 +3,6 @@ // See LICENSE file in the project root for full license information. // #include - #include "test/TensorHelpers.hpp" #include "LayerTests.hpp" @@ -13,6 +12,7 @@ #include "backends/RefWorkloadFactory.hpp" #include "backends/ClLayerSupport.hpp" #include "ActivationFixture.hpp" +#include "ClContextControlFixture.hpp" #include #include @@ -21,7 +21,7 @@ #include "test/UnitTests.hpp" -BOOST_AUTO_TEST_SUITE(Compute_ArmComputeCl) +BOOST_FIXTURE_TEST_SUITE(Compute_ArmComputeCl, ClContextControlFixture) using FactoryType = armnn::ClWorkloadFactory; // ============================================================================ @@ -65,27 +65,24 @@ ARMNN_AUTO_TEST_CASE(UnbiasedDepthwiseConvolution2dDepthMul1Uint8, DepthwiseConv ARMNN_AUTO_TEST_CASE(DepthwiseConvolution2dAsymmetric, DepthwiseConvolution2dAsymmetricTest, true) ARMNN_AUTO_TEST_CASE(UnbiasedDepthwiseConvolution2dAsymmetric, DepthwiseConvolution2dAsymmetricTest, false) -// Splitter -BOOST_AUTO_TEST_CASE(SimpleSplitter) +// Softmax +BOOST_AUTO_TEST_CASE(Softmax4dSupport) { - armnn::ClWorkloadFactory workloadFactory; - auto testResult = SplitterTest(workloadFactory); - for (unsigned int i = 0; i < testResult.size(); ++i) - { - BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected)); - } -} + const unsigned int numDimensions = 4u; + std::array dimensionSizes; + dimensionSizes.fill(1u); -BOOST_AUTO_TEST_CASE(SimpleSplitterUint8) -{ - armnn::ClWorkloadFactory workloadFactory; - auto testResult = SplitterUint8Test(workloadFactory); - for (unsigned int i = 0; i < testResult.size(); ++i) - { - BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected)); - } + const armnn::TensorInfo inputInfo(numDimensions, &dimensionSizes.front(), armnn::DataType::Float32); + const armnn::TensorInfo outputInfo(numDimensions, &dimensionSizes.front(), armnn::DataType::Float32); + + // 4D Softmax should be reported as unsupported on the CL backend + BOOST_TEST(!armnn::IsSoftmaxSupportedCl(inputInfo, outputInfo, armnn::SoftmaxDescriptor())); } +// Splitter +ARMNN_AUTO_TEST_CASE(SimpleSplitter, SplitterTest) +ARMNN_AUTO_TEST_CASE(SimpleSplitterUint8, SplitterUint8Test) + ARMNN_AUTO_TEST_CASE(CopyViaSplitter, CopyViaSplitterTest) ARMNN_AUTO_TEST_CASE(CopyViaSplitterUint8, CopyViaSplitterUint8Test) @@ -209,6 +206,19 @@ ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet1, PermuteFloat32ValueSet1Test) ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet2, PermuteFloat32ValueSet2Test) ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet3, PermuteFloat32ValueSet3Test) +// Lstm +ARMNN_AUTO_TEST_CASE(LstmLayerFloat32WithCifgWithPeepholeNoProjection, + LstmLayerFloat32WithCifgWithPeepholeNoProjectionTest) +ARMNN_AUTO_TEST_CASE(LstmLayerFloat32NoCifgNoPeepholeNoProjection, + LstmLayerFloat32NoCifgNoPeepholeNoProjectionTest) +ARMNN_AUTO_TEST_CASE(LstmLayerFloat32NoCifgWithPeepholeWithProjection, + LstmLayerFloat32NoCifgWithPeepholeWithProjectionTest) + +// Convert from Float16 to Float32 +ARMNN_AUTO_TEST_CASE(SimpleConvertFp16ToFp32, SimpleConvertFp16ToFp32Test) +// Convert from Float32 to Float16 +ARMNN_AUTO_TEST_CASE(SimpleConvertFp32ToFp16, SimpleConvertFp32ToFp16Test) + // ============================================================================ // COMPARE tests diff --git a/src/armnn/backends/test/ArmComputeNeon.cpp b/src/armnn/backends/test/ArmComputeNeon.cpp index 0a78b75e2e..12947ca77a 100644 --- a/src/armnn/backends/test/ArmComputeNeon.cpp +++ b/src/armnn/backends/test/ArmComputeNeon.cpp @@ -54,7 +54,7 @@ armnn::Convolution2dDescriptor MakeConv2dDesc(uint32_t strideX, uint32_t strideY BOOST_AUTO_TEST_CASE(Conv2dUtils) { - // the only preferred Neon convolution is 1x1 with padding=0 and stride size {1,2,3} + // The only preferred Neon convolution is 1x1 with padding=0 and stride size {1,2,3}. armnn::TensorShape shape1x1({ 1,1,1,1 }); armnn::TensorInfo info1x1(shape1x1, armnn::DataType::Float32); BOOST_TEST(armnn::IsNeonDirectConvolutionPreferred(info1x1, MakeConv2dDesc(1, 1))); @@ -98,49 +98,133 @@ armnn::DepthwiseConvolution2dDescriptor MakeDepthwiseConv2dDesc(uint32_t strideX uint32_t depthMultiplier = 1, uint32_t padLeft = 0, uint32_t padRight = 0, uint32_t padTop = 0, uint32_t padBottom = 0) { + boost::ignore_unused(depthMultiplier); + armnn::DepthwiseConvolution2dDescriptor desc; + desc.m_PadLeft = padLeft; desc.m_PadRight = padRight; + desc.m_PadTop = padTop; desc.m_PadBottom = padBottom; desc.m_StrideX = strideX; desc.m_StrideY = strideY; - desc.m_BiasEnabled = true; + desc.m_BiasEnabled = false; + return desc; } +armnn::TensorInfo CreateOutputTensorInfo(const armnn::TensorInfo& inputInfo, + const armnn::TensorInfo& weightsInfo, + const armnn::DepthwiseConvolution2dDescriptor& descriptor, + armnn::DataType dataType) +{ + const armnn::TensorShape& inputShape = inputInfo.GetShape(); + const armnn::TensorShape& filterShape = weightsInfo.GetShape(); + + unsigned int inWidth = inputShape[3]; + unsigned int inHeight = inputShape[2]; + unsigned int inBatchSize = inputShape[0]; + + unsigned int filterWidth = filterShape[3]; + unsigned int readWidth = (inWidth + descriptor.m_PadLeft + descriptor.m_PadRight) - (filterWidth); + unsigned int outWidth = 1u + (readWidth / descriptor.m_StrideX); + + unsigned int filterHeight = filterShape[2]; + unsigned int readHeight = (inHeight + descriptor.m_PadTop + descriptor.m_PadBottom) - (filterHeight); + unsigned int outHeight = 1u + (readHeight / descriptor.m_StrideY); + unsigned int depthMultiplier = filterShape[0]; + + unsigned int outChannels = filterShape[1] * depthMultiplier; + unsigned int outBatchSize = inBatchSize; + + armnn::TensorShape outputShape({outBatchSize, outChannels, outHeight, outWidth}); + return armnn::TensorInfo(outputShape, dataType); +} } BOOST_AUTO_TEST_CASE(DepthwiseConv2dUtils) { - armnn::TensorInfo inputInfo({ 1, 1, 10, 10 }, armnn::DataType::Float32); - armnn::TensorInfo weightsInfo3x3({ 1, 1, 3, 3 }, armnn::DataType::Float32); + const armnn::DataType dataType = armnn::DataType::Float32; + + armnn::TensorInfo inputInfo({1, 1, 10, 10 }, dataType); + armnn::TensorInfo outputInfo; + armnn::TensorInfo weightsInfo3x3({ 1, 1, 3, 3 }, dataType); + armnn::TensorInfo biasesInfo; + + armnn::DepthwiseConvolution2dDescriptor descriptor; // Strides supported: 1,2,3 - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(1, 1), weightsInfo3x3)); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(1, 2), weightsInfo3x3)); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(1, 3), weightsInfo3x3)); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(2, 1), weightsInfo3x3)); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(2, 2), weightsInfo3x3)); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(2, 3), weightsInfo3x3)); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(3, 1), weightsInfo3x3)); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(3, 2), weightsInfo3x3)); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(3, 3), weightsInfo3x3)); - - // Unsupported stride - BOOST_TEST(!armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(4, 1), weightsInfo3x3)); + descriptor = MakeDepthwiseConv2dDesc(1, 1); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); + + descriptor = MakeDepthwiseConv2dDesc(1, 2); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); + + descriptor = MakeDepthwiseConv2dDesc(1, 3); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); + + descriptor = MakeDepthwiseConv2dDesc(2, 1); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); + + descriptor = MakeDepthwiseConv2dDesc(2, 2); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); + + descriptor = MakeDepthwiseConv2dDesc(2, 3); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); + + descriptor = MakeDepthwiseConv2dDesc(3, 1); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); + + descriptor = MakeDepthwiseConv2dDesc(3, 2); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); + + descriptor = MakeDepthwiseConv2dDesc(3, 3); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); + + // Supported stride 4 + descriptor = MakeDepthwiseConv2dDesc(4, 1); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); // Supported weights shape 1x1 armnn::TensorInfo weightsInfo1x1({ 1, 1, 1, 1 }, armnn::DataType::Float32); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(1, 1), weightsInfo1x1)); + descriptor = MakeDepthwiseConv2dDesc(1, 1); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo1x1, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo1x1, biasesInfo)); // Supported shape 2x2 armnn::TensorInfo weightsInfo2x2({ 1, 1, 2, 2 }, armnn::DataType::Float32); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(1, 1), weightsInfo2x2)); + descriptor = MakeDepthwiseConv2dDesc(1, 1); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo2x2, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo2x2, biasesInfo)); // Asymmetric padding - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(1, 1, 1, 1, 2, 1, 2), - weightsInfo3x3)); + descriptor = MakeDepthwiseConv2dDesc(1, 1, 1, 1, 2, 1, 2); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); } // Pooling @@ -201,27 +285,24 @@ ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta2Uint8, SimpleSoftmaxUint8Test, 2.0f) ARMNN_AUTO_TEST_CASE(ReLu1Uint8, BoundedReLuUint8UpperAndLowerBoundTest) ARMNN_AUTO_TEST_CASE(ReLu6Uint8, BoundedReLuUint8UpperBoundOnlyTest) -// Splitter -BOOST_AUTO_TEST_CASE(SimpleSplitter) +// Softmax +BOOST_AUTO_TEST_CASE(Softmax4dSupport) { - armnn::NeonWorkloadFactory workloadFactory; - auto testResult = SplitterTest(workloadFactory); - for (unsigned int i = 0; i < testResult.size(); ++i) - { - BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected)); - } -} + const unsigned int numDimensions = 4u; + std::array dimensionSizes; + dimensionSizes.fill(1u); -BOOST_AUTO_TEST_CASE(SimpleSplitterUint8) -{ - armnn::NeonWorkloadFactory workloadFactory; - auto testResult = SplitterUint8Test(workloadFactory); - for (unsigned int i = 0; i < testResult.size(); ++i) - { - BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected)); - } + const armnn::TensorInfo inputInfo(numDimensions, &dimensionSizes.front(), armnn::DataType::Float32); + const armnn::TensorInfo outputInfo(numDimensions, &dimensionSizes.front(), armnn::DataType::Float32); + + // 4D Softmax should be reported as unsupported on the NEON backend + BOOST_TEST(!armnn::IsSoftmaxSupportedNeon(inputInfo, outputInfo, armnn::SoftmaxDescriptor())); } +// Splitter +ARMNN_AUTO_TEST_CASE(SimpleSplitter, SplitterTest) +ARMNN_AUTO_TEST_CASE(SimpleSplitterUint8, SplitterUint8Test) + ARMNN_AUTO_TEST_CASE(CopyViaSplitter, CopyViaSplitterTest) ARMNN_AUTO_TEST_CASE(CopyViaSplitterUint8, CopyViaSplitterUint8Test) @@ -375,5 +456,4 @@ ARMNN_COMPARE_REF_FIXTURE_TEST_CASE(CompareSqrtActivationWithReference, Positive ARMNN_COMPARE_REF_FIXTURE_TEST_CASE(CompareSquareActivationWithReference, ActivationFixture, CompareActivationTest, armnn::ActivationFunction::Square, 5u) - BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/backends/test/BatchNormTestImpl.hpp b/src/armnn/backends/test/BatchNormTestImpl.hpp index 861ef6b053..82e6e86747 100644 --- a/src/armnn/backends/test/BatchNormTestImpl.hpp +++ b/src/armnn/backends/test/BatchNormTestImpl.hpp @@ -52,7 +52,7 @@ LayerTestResult BatchNormTestImpl(armnn::IWorkloadFactory& workloadFactory, 4.f, 1.f, -2.f, 4.f })); - // these values are per-channel of the input + // These values are per-channel of the input. auto mean = MakeTensor(tensorInfo, QuantizedVector(qScale, qOffset, {3, -2})); auto variance = MakeTensor(tensorInfo, QuantizedVector(qScale, qOffset, {4, 9})); auto beta = MakeTensor(tensorInfo, QuantizedVector(qScale, qOffset, {3, 2})); @@ -82,8 +82,8 @@ LayerTestResult BatchNormTestImpl(armnn::IWorkloadFactory& workloadFactory, data.m_Gamma = &gammaTensor; data.m_Parameters.m_Eps = 0.0f; - // for each channel: - // substract mean, divide by standard deviation (with an epsilon to avoid div by 0) + // For each channel: + // substract mean, divide by standard deviation (with an epsilon to avoid div by 0), // multiply by gamma and add beta ret.outputExpected = MakeTensor(outputTensorInfo, QuantizedVector(qScale, qOffset, diff --git a/src/armnn/backends/test/ClContextControlFixture.hpp b/src/armnn/backends/test/ClContextControlFixture.hpp new file mode 100644 index 0000000000..13c061f818 --- /dev/null +++ b/src/armnn/backends/test/ClContextControlFixture.hpp @@ -0,0 +1,21 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/ClContextControl.hpp" + +template +struct ClContextControlFixtureBase +{ + // Initialising ClContextControl to ensure OpenCL is loaded correctly for each test case + ClContextControlFixtureBase() : m_ClContextControl(nullptr, ProfilingEnabled) {} + ~ClContextControlFixtureBase() {} + + armnn::ClContextControl m_ClContextControl; +}; + +using ClContextControlFixture = ClContextControlFixtureBase; +using ClProfilingContextControlFixture = ClContextControlFixtureBase; diff --git a/src/armnn/backends/test/Conv2dTestImpl.hpp b/src/armnn/backends/test/Conv2dTestImpl.hpp index 0c34beaa33..43297880f8 100644 --- a/src/armnn/backends/test/Conv2dTestImpl.hpp +++ b/src/armnn/backends/test/Conv2dTestImpl.hpp @@ -32,7 +32,7 @@ struct FullyConnectedBiasTypeForInputType using Type = int32_t; }; -// Modifies a std::vector in-place using a specified bias +// Modifies a std::vector in-place using a specified bias. template void ApplyBias(std::vector& v, float vScale, int32_t vOffset, const std::vector& bias, float bScale, int32_t bOffset, uint32_t w, uint32_t h) @@ -42,7 +42,7 @@ void ApplyBias(std::vector& v, float vScale, int32_t vOffset, BOOST_ASSERT_MSG((armnn::IsQuantizedType() && bScale != 0.0f) || (!armnn::IsQuantizedType()), "Invalid type and parameter combination."); - // Note we need to dequantize and re-quantize the image value and the bias + // Note we need to dequantize and re-quantize the image value and the bias. for (uint32_t i = 0; i < bias.size(); ++i) { float dBias = SelectiveDequantize(bias[i], bScale, bOffset); @@ -90,15 +90,15 @@ LayerTestResult SimpleConvolution2dTestImpl(armnn::IWorkloadFactory& workl bool biasEnabled = bias.size() > 0; - // This function currently assumes 1 batch of input/output (and duplicates this into 2 batches) + // This function currently assumes 1 batch of input/output (and duplicates this into 2 batches). BOOST_ASSERT(inputNum == 1); BOOST_ASSERT(outputNum == 1); - // If a bias is used, its size must equal the number of output channels + // If a bias is used, its size must equal the number of output channels. BOOST_ASSERT(!biasEnabled || bias.size() == outputChannels); - // Note these tensors will use two (identical) batches + // Note these tensors will use two (identical) batches. armnn::TensorInfo inputTensorInfo({2*inputNum, inputChannels, inputHeight, inputWidth}, armnn::GetDataType()); armnn::TensorInfo outputTensorInfo({2*outputNum, outputChannels, outputHeight, outputWidth}, armnn::GetDataType()); @@ -120,7 +120,7 @@ LayerTestResult SimpleConvolution2dTestImpl(armnn::IWorkloadFactory& workl LayerTestResult ret(outputTensorInfo); - // Construct input data - Two batches of the same input image + // Construct input data - two batches of the same input image. std::vector inputImage; inputImage.assign(input.data(), input.data() + 1*inputChannels*inputHeight*inputWidth); std::vector inputData; @@ -131,7 +131,7 @@ LayerTestResult SimpleConvolution2dTestImpl(armnn::IWorkloadFactory& workl std::vector outputImage; outputImage.assign(outputExpected.data(), outputExpected.data() + outputChannels*outputHeight*outputWidth); - // Apply bias to output image if enabled + // Apply bias to output image if it is enabled. if(biasEnabled) { std::vector biasV; @@ -141,14 +141,14 @@ LayerTestResult SimpleConvolution2dTestImpl(armnn::IWorkloadFactory& workl outputWidth, outputHeight); } - // Construct expected output data - two identical images + // Construct expected output data - two identical images. std::vector outputData; outputData.insert(outputData.end(), outputImage.begin(), outputImage.end()); outputData.insert(outputData.end(), outputImage.begin(), outputImage.end()); ret.outputExpected = MakeTensor(outputTensorInfo, outputData); - // todo: nontrivial padding and strides + // Todo: nontrivial padding and strides. uint32_t strideX = 1; uint32_t strideY = 1; @@ -171,7 +171,7 @@ LayerTestResult SimpleConvolution2dTestImpl(armnn::IWorkloadFactory& workl AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); data.m_Weight = &weightsTensor; - data.m_Bias = &biasTensor; // still set this whether or not bias is enabled - can be a source of bugs + data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled - can be a source of bugs. data.m_Parameters.m_StrideX = strideX; data.m_Parameters.m_StrideY = strideY; data.m_Parameters.m_PadLeft = padLeft; @@ -222,11 +222,11 @@ LayerTestResult DepthwiseConvolution2dAsymmetricTestImpl(armnn::IWorkloadF unsigned int outputHeight = boost::numeric_cast(outputExpected.shape()[2]); unsigned int outputWidth = boost::numeric_cast(outputExpected.shape()[3]); - // If a bias is used, its size must equal the number of output channels + // If a bias is used, its size must equal the number of output channels. bool biasEnabled = bias.size() > 0; BOOST_ASSERT(!biasEnabled || bias.size() == outputChannels); - // create the tensors + // Creates the tensors. armnn::TensorInfo inputTensorInfo({inputNum, inputChannels, inputHeight, inputWidth}, armnn::GetDataType()); armnn::TensorInfo outputTensorInfo({outputNum, outputChannels, outputHeight, outputWidth}, armnn::GetDataType()); @@ -246,12 +246,12 @@ LayerTestResult DepthwiseConvolution2dAsymmetricTestImpl(armnn::IWorkloadF biasDesc.SetQuantizationOffset(0); } - // Construct the input data + // Construct the input data. std::vector inputData; inputData.assign(input.data(), input.data() + inputChannels*inputHeight*inputWidth); auto batchedInput = MakeTensor(inputTensorInfo, inputData); - // Construct the output data, with bias applied, as appropriate + // Construct the output data, with bias applied, as appropriate. std::vector outputData; outputData.assign(outputExpected.data(), outputExpected.data() + outputChannels*outputHeight*outputWidth); if (biasEnabled) @@ -280,7 +280,7 @@ LayerTestResult DepthwiseConvolution2dAsymmetricTestImpl(armnn::IWorkloadF armnn::DepthwiseConvolution2dQueueDescriptor data; data.m_Weight = &weightsTensor; - data.m_Bias = &biasTensor; // still set this whether or not bias is enabled - can be a source of bugs + data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled - it can be a source of bugs. data.m_Parameters.m_StrideX = strideX; data.m_Parameters.m_StrideY = strideY; data.m_Parameters.m_PadLeft = padLeft; @@ -372,14 +372,14 @@ LayerTestResult DepthwiseConvolution2dDepthMul1TestImpl(armnn::IWorkloadFa -1.f, 0.f, -1.f, }))); - // manually calculated + // Manually calculated. std::vector outputImage( QuantizedVector(outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(), {0.f, 0.f}) ); - // Optionally apply bias to output image + // Optionally apply bias to output image. if(biasEnabled) { ApplyBias(outputImage, outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(), @@ -405,7 +405,7 @@ LayerTestResult DepthwiseConvolution2dDepthMul1TestImpl(armnn::IWorkloadFa AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); data.m_Weight = &weightsTensor; - data.m_Bias = &biasTensor; // still set this whether or not bias is enabled + data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled. data.m_Parameters.m_StrideX = 1; data.m_Parameters.m_StrideY = 1; data.m_Parameters.m_PadLeft = 0; @@ -520,7 +520,7 @@ LayerTestResult DepthwiseConvolution2dTestImpl(armnn::IWorkloadFactory& wo 0, 0, 0 }))); - // manually calculated + // Manually calculated. std::vector outputImage = std::vector( QuantizedVector(outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(), { 3.5f, 3.5f, 3.5f, 3.5f, 3.5f, 3.5f, 3.5f, @@ -552,7 +552,7 @@ LayerTestResult DepthwiseConvolution2dTestImpl(armnn::IWorkloadFactory& wo 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f })); - // Optionally apply bias to output image + // Optionally apply bias to output image. if(biasEnabled) { ApplyBias(outputImage, outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(), @@ -578,7 +578,7 @@ LayerTestResult DepthwiseConvolution2dTestImpl(armnn::IWorkloadFactory& wo AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); data.m_Weight = &weightsTensor; - data.m_Bias = &biasTensor; // still set this whether or not bias is enabled + data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled. data.m_Parameters.m_StrideX = 2; data.m_Parameters.m_StrideY = 1; data.m_Parameters.m_PadLeft = 0; @@ -609,7 +609,7 @@ LayerTestResult Convolution1dTestImpl(armnn::IWorkloadFactory& workloadFact { using B = typename FullyConnectedBiasTypeForInputType::Type; - // until we have a specialist 1D convolution layer, we can fake one using + // Until we have a specialist 1D convolution layer, we can fake one using // 2D convolution with the final dimension set to 1. // I don't anticipate this being particularly slow, given that convolution is implemented // as a matrix multiplication, at which point dimension doesn't matter. @@ -617,11 +617,11 @@ LayerTestResult Convolution1dTestImpl(armnn::IWorkloadFactory& workloadFact unsigned int batchSize = 1; unsigned int inputChannels = 2; unsigned int outputChannels = 3; - unsigned int inputSize = 5; // the 1D size (could view as 'width' or 'height') + unsigned int inputSize = 5; // The 1D size (could view as 'width' or 'height'). unsigned int kernelSize = 3; unsigned int padSize = 2; unsigned int stride = 1; - unsigned int outputSize = 7; // (inputSize + 2 * padSize - kernelSize + 1) / stride + unsigned int outputSize = 7; // (inputSize + 2 * padSize - kernelSize + 1) / stride. armnn::TensorInfo inputInfo({batchSize, inputChannels, inputSize, 1}, armnn::GetDataType()); armnn::TensorInfo outputInfo({batchSize, outputChannels, outputSize, 1}, armnn::GetDataType()); @@ -671,7 +671,7 @@ LayerTestResult Convolution1dTestImpl(armnn::IWorkloadFactory& workloadFact 2.5f, -1.0f + 3.0f, 1.25f - 3.2f + 2.5f, -1.0f - 5.0f, 1.25f + 0.5f - 2.0f, -3.0f, 0.5f })); - // Optionally apply bias to output image + // Optionally apply bias to output image. if(biasEnabled) { ApplyBias(outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), @@ -712,7 +712,7 @@ LayerTestResult Convolution1dTestImpl(armnn::IWorkloadFactory& workloadFact workloadFactory.Finalize(); workload->Execute(); - // output + // Output LayerTestResult ret(outputInfo); CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get()); ret.outputExpected = MakeTensor(outputInfo, outputData); diff --git a/src/armnn/backends/test/ConvertFp16ToFp32TestImpl.hpp b/src/armnn/backends/test/ConvertFp16ToFp32TestImpl.hpp new file mode 100644 index 0000000000..89faaf9fe6 --- /dev/null +++ b/src/armnn/backends/test/ConvertFp16ToFp32TestImpl.hpp @@ -0,0 +1,55 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include +#include +#include + +#include +#include + +#include + +#include + +LayerTestResult SimpleConvertFp16ToFp32Test(armnn::IWorkloadFactory& workloadFactory) +{ + using namespace half_float::literal; + + const armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16); + const armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32); + + auto input = MakeTensor(inputTensorInfo, + { -37.5_h, -15.2_h, -8.76_h, -2.0_h, -1.5_h, -1.3_h, -0.5_h, -0.4_h, 0.0_h, + 1.0_h, 0.4_h, 0.5_h, 1.3_h, 1.5_h, 2.0_h, 8.76_h, 15.2_h, 37.5_h }); + + LayerTestResult ret(outputTensorInfo); + ret.outputExpected = MakeTensor(outputTensorInfo, + { -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f, + 1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f }); + + std::unique_ptr inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); + std::unique_ptr outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); + + armnn::ConvertFp16ToFp32QueueDescriptor data; + armnn::WorkloadInfo info; + AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); + AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); + + std::unique_ptr workload = workloadFactory.CreateConvertFp16ToFp32(data, info); + + inputHandle->Allocate(); + outputHandle->Allocate(); + + CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]); + + workload->Execute(); + + CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get()); + + return ret; +} diff --git a/src/armnn/backends/test/ConvertFp32ToFp16TestImpl.hpp b/src/armnn/backends/test/ConvertFp32ToFp16TestImpl.hpp new file mode 100644 index 0000000000..1d9bee577c --- /dev/null +++ b/src/armnn/backends/test/ConvertFp32ToFp16TestImpl.hpp @@ -0,0 +1,55 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include +#include +#include + +#include +#include + +#include + +#include + +LayerTestResult SimpleConvertFp32ToFp16Test(armnn::IWorkloadFactory& workloadFactory) +{ + using namespace half_float::literal; + + const armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32); + const armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16); + + auto input = MakeTensor(inputTensorInfo, + { -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f, + 1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f }); + + LayerTestResult ret(outputTensorInfo); + ret.outputExpected = MakeTensor(outputTensorInfo, + { -37.5_h, -15.2_h, -8.76_h, -2.0_h, -1.5_h, -1.3_h, -0.5_h, -0.4_h, 0.0_h, + 1.0_h, 0.4_h, 0.5_h, 1.3_h, 1.5_h, 2.0_h, 8.76_h, 15.2_h, 37.5_h }); + + std::unique_ptr inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); + std::unique_ptr outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); + + armnn::ConvertFp32ToFp16QueueDescriptor data; + armnn::WorkloadInfo info; + AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); + AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); + + std::unique_ptr workload = workloadFactory.CreateConvertFp32ToFp16(data, info); + + inputHandle->Allocate(); + outputHandle->Allocate(); + + CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]); + + workload->Execute(); + + CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get()); + + return ret; +} \ No newline at end of file diff --git a/src/armnn/backends/test/CreateWorkloadCl.cpp b/src/armnn/backends/test/CreateWorkloadCl.cpp index f83bb12bbe..5d4265911f 100644 --- a/src/armnn/backends/test/CreateWorkloadCl.cpp +++ b/src/armnn/backends/test/CreateWorkloadCl.cpp @@ -8,6 +8,7 @@ #include "backends/ClWorkloadUtils.hpp" #include "backends/ClWorkloads.hpp" #include "backends/ClTensorHandle.hpp" +#include "ClContextControlFixture.hpp" #include "test/CreateWorkloadClNeon.hpp" @@ -17,16 +18,17 @@ boost::test_tools::predicate_result CompareIClTensorHandleShape(IClTensorHandle* return CompareTensorHandleShape(tensorHandle, expectedDimensions); } -BOOST_AUTO_TEST_SUITE(CreateWorkloadCl) +BOOST_FIXTURE_TEST_SUITE(CreateWorkloadCl, ClContextControlFixture) -BOOST_AUTO_TEST_CASE(CreateActivationWorkload) +template +static void ClCreateActivationWorkloadTest() { Graph graph; ClWorkloadFactory factory; - auto workload = CreateActivationWorkloadTest(factory, graph); + auto workload = CreateActivationWorkloadTest(factory, graph); - // check that inputs/outputs are as we expect them (see definition of CreateActivationWorkloadTest) + // Checks that inputs/outputs are as we expect them (see definition of CreateActivationWorkloadTest). ActivationQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); @@ -35,14 +37,24 @@ BOOST_AUTO_TEST_CASE(CreateActivationWorkload) BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {1})); } -BOOST_AUTO_TEST_CASE(CreateAdditionWorkload) +BOOST_AUTO_TEST_CASE(CreateActivationFloat32Workload) +{ + ClCreateActivationWorkloadTest(); +} + +BOOST_AUTO_TEST_CASE(CreateActivationFloat16Workload) +{ + ClCreateActivationWorkloadTest(); +} + +template +static void ClCreateAdditionWorkloadTest() { Graph graph; ClWorkloadFactory factory; + auto workload = CreateAdditionWorkloadTest(factory, graph); - auto workload = CreateAdditionWorkloadTest(factory, graph); - - // check that inputs/outputs are as we expect them (see definition of CreateAdditionWorkloadTest) + // Checks that inputs/outputs are as we expect them (see definition of CreateAdditionWorkloadTest). AdditionQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle1 = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto inputHandle2 = boost::polymorphic_downcast(queueDescriptor.m_Inputs[1]); @@ -52,14 +64,26 @@ BOOST_AUTO_TEST_CASE(CreateAdditionWorkload) BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {2, 3})); } -BOOST_AUTO_TEST_CASE(CreateBatchNormalizationWorkload) +BOOST_AUTO_TEST_CASE(CreateAdditionFloat32Workload) { - Graph graph; + ClCreateAdditionWorkloadTest(); +} + +BOOST_AUTO_TEST_CASE(CreateAdditionFloat16Workload) +{ + ClCreateAdditionWorkloadTest(); +} + +template +static void ClCreateBatchNormalizationWorkloadTest() +{ + Graph graph; ClWorkloadFactory factory; - auto workload = CreateBatchNormalizationWorkloadTest(factory, graph); + auto workload = CreateBatchNormalizationWorkloadTest + (factory, graph); - // check that inputs/outputs are as we expect them (see definition of CreateBatchNormalizationWorkloadTest) + // Checks that inputs/outputs are as we expect them (see definition of CreateBatchNormalizationWorkloadTest). BatchNormalizationQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); @@ -68,14 +92,57 @@ BOOST_AUTO_TEST_CASE(CreateBatchNormalizationWorkload) BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {2, 3, 1, 1})); } -template -static void Convolution2dWorkloadTest() +BOOST_AUTO_TEST_CASE(CreateBatchNormalizationFloat32Workload) +{ + ClCreateBatchNormalizationWorkloadTest(); +} + +BOOST_AUTO_TEST_CASE(CreateBatchNormalizationFloat16Workload) +{ + ClCreateBatchNormalizationWorkloadTest(); +} + +BOOST_AUTO_TEST_CASE(CreateConvertFp16ToFp32Workload) +{ + Graph graph; + ClWorkloadFactory factory; + auto workload = CreateConvertFp16ToFp32WorkloadTest(factory, graph); + + ConvertFp16ToFp32QueueDescriptor queueDescriptor = workload->GetData(); + auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); + auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); + + BOOST_TEST(CompareIClTensorHandleShape(inputHandle, {3, 2, 3})); + BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {3, 2, 3})); + BOOST_TEST((inputHandle->GetTensor().info()->data_type() == arm_compute::DataType::F16)); + BOOST_TEST((outputHandle->GetTensor().info()->data_type() == arm_compute::DataType::F32)); +} + +BOOST_AUTO_TEST_CASE(CreateConvertFp32ToFp16Workload) +{ + Graph graph; + ClWorkloadFactory factory; + auto workload = CreateConvertFp32ToFp16WorkloadTest(factory, graph); + + ConvertFp32ToFp16QueueDescriptor queueDescriptor = workload->GetData(); + auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); + auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); + + BOOST_TEST(CompareIClTensorHandleShape(inputHandle, {3, 2, 3})); + BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {3, 2, 3})); + BOOST_TEST((inputHandle->GetTensor().info()->data_type() == arm_compute::DataType::F32)); + BOOST_TEST((outputHandle->GetTensor().info()->data_type() == arm_compute::DataType::F16)); +} + +template +static void ClConvolution2dWorkloadTest() { - Graph graph; - ClWorkloadFactory factory; - auto workload = CreateConvolution2dWorkloadTest(factory, graph); + Graph graph; + ClWorkloadFactory factory; + auto workload = CreateConvolution2dWorkloadTest + (factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest). Convolution2dQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); @@ -85,18 +152,24 @@ static void Convolution2dWorkloadTest() BOOST_AUTO_TEST_CASE(CreateConvolution2dFloat32Workload) { - Convolution2dWorkloadTest(); + ClConvolution2dWorkloadTest(); } +BOOST_AUTO_TEST_CASE(CreateConvolution2dFloat16Workload) +{ + ClConvolution2dWorkloadTest(); +} -template -static void DirectConvolution2dWorkloadTest() + +template +static void ClDirectConvolution2dWorkloadTest() { - Graph graph; - ClWorkloadFactory factory; - auto workload = CreateDirectConvolution2dWorkloadTest(factory, graph); + Graph graph; + ClWorkloadFactory factory; + auto workload = CreateDirectConvolution2dWorkloadTest( + factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateDirectConvolution2dWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateDirectConvolution2dWorkloadTest). Convolution2dQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); @@ -106,22 +179,28 @@ static void DirectConvolution2dWorkloadTest() BOOST_AUTO_TEST_CASE(CreateDirectConvolution2dFloat32Workload) { - DirectConvolution2dWorkloadTest(); + ClDirectConvolution2dWorkloadTest(); +} + +BOOST_AUTO_TEST_CASE(CreateDirectConvolution2dFloat16Workload) +{ + ClDirectConvolution2dWorkloadTest(); } BOOST_AUTO_TEST_CASE(CreateDirectConvolution2dUint8Workload) { - DirectConvolution2dWorkloadTest(); + ClDirectConvolution2dWorkloadTest(); } -BOOST_AUTO_TEST_CASE(CreateFullyConnectedWorkload) +template +static void ClCreateFullyConnectedWorkloadTest() { - Graph graph; + Graph graph; ClWorkloadFactory factory; - auto workload = - CreateFullyConnectedWorkloadTest(factory, graph); + auto workload = + CreateFullyConnectedWorkloadTest(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateFullyConnectedWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateFullyConnectedWorkloadTest). FullyConnectedQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); @@ -129,15 +208,28 @@ BOOST_AUTO_TEST_CASE(CreateFullyConnectedWorkload) BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {3, 7})); } -BOOST_AUTO_TEST_CASE(CreateMultiplicationWorkload) + +BOOST_AUTO_TEST_CASE(CreateFullyConnectedFloat32WorkloadTest) { - Graph graph; + ClCreateFullyConnectedWorkloadTest(); +} + +BOOST_AUTO_TEST_CASE(CreateFullyConnectedFloat16WorkloadTest) +{ + ClCreateFullyConnectedWorkloadTest(); +} + + +template +static void ClCreateMultiplicationWorkloadTest() +{ + Graph graph; ClWorkloadFactory factory; auto workload = - CreateMultiplicationWorkloadTest(factory, graph); + CreateMultiplicationWorkloadTest(factory, graph); - // check that inputs/outputs are as we expect them (see definition of CreateMultiplicationWorkloadTest) + // Checks that inputs/outputs are as we expect them (see definition of CreateMultiplicationWorkloadTest). MultiplicationQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle1 = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto inputHandle2 = boost::polymorphic_downcast(queueDescriptor.m_Inputs[1]); @@ -147,14 +239,26 @@ BOOST_AUTO_TEST_CASE(CreateMultiplicationWorkload) BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {2, 3})); } -BOOST_AUTO_TEST_CASE(CreateNormalizationWorkload) +BOOST_AUTO_TEST_CASE(CreateMultiplicationFloat32WorkloadTest) +{ + ClCreateMultiplicationWorkloadTest(); +} + +BOOST_AUTO_TEST_CASE(CreateMultiplicationFloat16WorkloadTest) +{ + ClCreateMultiplicationWorkloadTest(); +} + +template +static void ClNormalizationWorkloadTest() { - Graph graph; + Graph graph; ClWorkloadFactory factory; - auto workload = CreateNormalizationWorkloadTest(factory, graph); + auto workload = CreateNormalizationWorkloadTest + (factory, graph); - // check that inputs/outputs are as we expect them (see definition of CreateNormalizationWorkloadTest) + // Checks that inputs/outputs are as we expect them (see definition of CreateNormalizationWorkloadTest). NormalizationQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); @@ -163,14 +267,25 @@ BOOST_AUTO_TEST_CASE(CreateNormalizationWorkload) BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {3, 5, 5, 1})); } -BOOST_AUTO_TEST_CASE(CreatePooling2dWorkload) +BOOST_AUTO_TEST_CASE(CreateNormalizationFloat32Workload) { - Graph graph; + ClNormalizationWorkloadTest(); +} + +BOOST_AUTO_TEST_CASE(CreateNormalizationFloat16Workload) +{ + ClNormalizationWorkloadTest(); +} + +template +static void ClPooling2dWorkloadTest() +{ + Graph graph; ClWorkloadFactory factory; - auto workload = CreatePooling2dWorkloadTest(factory, graph); + auto workload = CreatePooling2dWorkloadTest(factory, graph); - // check that inputs/outputs are as we expect them (see definition of CreatePooling2dWorkloadTest) + // Check that inputs/outputs are as we expect them (see definition of CreatePooling2dWorkloadTest). Pooling2dQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); @@ -179,18 +294,28 @@ BOOST_AUTO_TEST_CASE(CreatePooling2dWorkload) BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {3, 2, 2, 4})); } -template +BOOST_AUTO_TEST_CASE(CreatePooling2dFloat32Workload) +{ + ClPooling2dWorkloadTest(); +} + +BOOST_AUTO_TEST_CASE(CreatePooling2dFloat16Workload) +{ + ClPooling2dWorkloadTest(); +} + +template static void ClCreateReshapeWorkloadTest() { - Graph graph; + Graph graph; ClWorkloadFactory factory; - auto workload = CreateReshapeWorkloadTest(factory, graph); + auto workload = CreateReshapeWorkloadTest(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateReshapeWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateReshapeWorkloadTest). ReshapeQueueDescriptor queueDescriptor = workload->GetData(); - auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); - auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); + auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); + auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); BOOST_TEST(CompareIClTensorHandleShape(inputHandle, {4, 1})); BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {4})); // Leading size 1 dimensions are collapsed by ACL. @@ -198,38 +323,56 @@ static void ClCreateReshapeWorkloadTest() BOOST_AUTO_TEST_CASE(CreateReshapeFloat32Workload) { - ClCreateReshapeWorkloadTest(); + ClCreateReshapeWorkloadTest(); +} + +BOOST_AUTO_TEST_CASE(CreateReshapeFloat16Workload) +{ + ClCreateReshapeWorkloadTest(); } BOOST_AUTO_TEST_CASE(CreateReshapeUint8Workload) { - ClCreateReshapeWorkloadTest(); + ClCreateReshapeWorkloadTest(); } -BOOST_AUTO_TEST_CASE(CreateSoftmaxWorkload) +template +static void ClSoftmaxWorkloadTest() { - Graph graph; + Graph graph; ClWorkloadFactory factory; - auto workload = CreateSoftmaxWorkloadTest(factory, graph); + auto workload = CreateSoftmaxWorkloadTest(factory, graph); - // check that inputs/outputs are as we expect them (see definition of ClSoftmaxFloat32Workload) + // Checks that inputs/outputs are as we expect them (see definition of ClSoftmaxFloat32Workload). SoftmaxQueueDescriptor queueDescriptor = workload->GetData(); - auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); - auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); + auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); + auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); BOOST_TEST(CompareIClTensorHandleShape(inputHandle, {4, 1})); BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {4, 1})); } -BOOST_AUTO_TEST_CASE(CreateSplitterWorkload) + +BOOST_AUTO_TEST_CASE(CreateSoftmaxFloat32WorkloadTest) +{ + ClSoftmaxWorkloadTest(); +} + +BOOST_AUTO_TEST_CASE(CreateSoftmaxFloat16WorkloadTest) +{ + ClSoftmaxWorkloadTest(); +} + +template +static void ClSplitterWorkloadTest() { Graph graph; ClWorkloadFactory factory; - auto workload = CreateSplitterWorkloadTest(factory, graph); + auto workload = CreateSplitterWorkloadTest(factory, graph); - // check that outputs are as we expect them (see definition of CreateSplitterWorkloadTest) + // Checks that outputs are as we expect them (see definition of CreateSplitterWorkloadTest). SplitterQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); BOOST_TEST(CompareIClTensorHandleShape(inputHandle, {5, 7, 7})); @@ -242,14 +385,25 @@ BOOST_AUTO_TEST_CASE(CreateSplitterWorkload) auto outputHandle0 = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); // NOTE: At the moment the CL collapses the tensor to a 2 dim when dimension zero = 1 - // we are raising this difference between the NEON and CL libs as an issue with the compute library team + // we are raising this difference between the NEON and CL libs as an issue with the compute library team. BOOST_TEST(CompareIClTensorHandleShape(outputHandle0, {7, 7})); } -BOOST_AUTO_TEST_CASE(CreateSplitterMerger) +BOOST_AUTO_TEST_CASE(CreateSplitterFloat32Workload) +{ + ClSplitterWorkloadTest(); +} + +BOOST_AUTO_TEST_CASE(CreateSplitterFloat16Workload) { - // Test that it is possible to decide which output of the splitter layer - // should be lined to which input of the merger layer + ClSplitterWorkloadTest(); +} + +template +static void ClSplitterMergerTest() +{ + // Tests that it is possible to decide which output of the splitter layer + // should be lined to which input of the merger layer. // We test that is is possible to specify 0th output // of the splitter to be the 1st input to the merger and the 1st output of the splitter to be 0th input // of the merger. @@ -258,12 +412,13 @@ BOOST_AUTO_TEST_CASE(CreateSplitterMerger) ClWorkloadFactory factory; auto workloads = - CreateSplitterMergerWorkloadTest(factory, graph); + CreateSplitterMergerWorkloadTest + (factory, graph); auto wlSplitter = std::move(workloads.first); auto wlMerger = std::move(workloads.second); - //check that the index of inputs/outputs matches what we declared on InputDescriptor construction. + //Checks that the index of inputs/outputs matches what we declared on InputDescriptor construction. armnn::ClSubTensorHandle* sOut0 = dynamic_cast(wlSplitter->GetData().m_Outputs[0]); armnn::ClSubTensorHandle* sOut1 = dynamic_cast(wlSplitter->GetData().m_Outputs[1]); armnn::ClSubTensorHandle* mIn0 = dynamic_cast(wlMerger->GetData().m_Inputs[0]); @@ -274,22 +429,33 @@ BOOST_AUTO_TEST_CASE(CreateSplitterMerger) BOOST_TEST(mIn0); BOOST_TEST(mIn1); - //fliped order of inputs/outputs + //Fliped order of inputs/outputs. bool validDataPointers = (sOut0 == mIn1) && (sOut1 == mIn0); BOOST_TEST(validDataPointers); - //also make sure that the inputs are subtensors of one tensor and outputs are sub tensors of another tensor + //Also make sure that the inputs are subtensors of one tensor and outputs are sub tensors of another tensor. bool validSubTensorParents = (mIn0->GetTensor().parent() == mIn1->GetTensor().parent()) && (sOut0->GetTensor().parent() == sOut1->GetTensor().parent()); BOOST_TEST(validSubTensorParents); } +BOOST_AUTO_TEST_CASE(CreateSplitterMergerFloat32Workload) +{ + ClSplitterMergerTest(); +} + +BOOST_AUTO_TEST_CASE(CreateSplitterMergerFloat16Workload) +{ + ClSplitterMergerTest(); +} + + BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputs) { // Test that it is possible to assign multiple (two) different layers to each of the outputs of a splitter layer. - // We create a splitter with two outputs. That each of those outputs is used by two different activation layers + // We create a splitter with two outputs. That each of those outputs is used by two different activation layers. Graph graph; ClWorkloadFactory factory; @@ -300,9 +466,10 @@ BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputs) std::unique_ptr wlActiv1_1; CreateSplitterMultipleInputsOneOutputWorkloadTest(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1, wlActiv1_0, wlActiv1_1); + ClActivationFloat32Workload, armnn::DataType::Float32>(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1, + wlActiv1_0, wlActiv1_1); - //check that the index of inputs/outputs matches what we declared on InputDescriptor construction. + //Checks that the index of inputs/outputs matches what we declared on InputDescriptor construction. armnn::ClSubTensorHandle* sOut0 = dynamic_cast(wlSplitter->GetData().m_Outputs[0]); armnn::ClSubTensorHandle* sOut1 = dynamic_cast(wlSplitter->GetData().m_Outputs[1]); armnn::ClSubTensorHandle* activ0_0Im = dynamic_cast(wlActiv0_0->GetData().m_Inputs[0]); @@ -327,17 +494,18 @@ BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputs) BOOST_AUTO_TEST_CASE(CreateMemCopyWorkloadsCl) { ClWorkloadFactory factory; - CreateMemCopyWorkloads(factory); + CreateMemCopyWorkloads(factory); } BOOST_AUTO_TEST_CASE(CreateL2NormalizationWorkload) { - Graph graph; + Graph graph; ClWorkloadFactory factory; - auto workload = CreateL2NormalizationWorkloadTest(factory, graph); + auto workload = CreateL2NormalizationWorkloadTest + (factory, graph); - // check that inputs/outputs are as we expect them (see definition of CreateNormalizationWorkloadTest) + // Checks that inputs/outputs are as we expect them (see definition of CreateNormalizationWorkloadTest). L2NormalizationQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); @@ -346,4 +514,24 @@ BOOST_AUTO_TEST_CASE(CreateL2NormalizationWorkload) BOOST_TEST(CompareIClTensorHandleShape(outputHandle, { 5, 20, 50, 67 })); } +template +static void ClCreateLstmWorkloadTest() +{ + Graph graph; + ClWorkloadFactory factory; + auto workload = CreateLstmWorkloadTest(factory, graph); + + LstmQueueDescriptor queueDescriptor = workload->GetData(); + auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); + auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[1]); + BOOST_TEST(CompareIClTensorHandleShape(inputHandle, { 2, 2 })); + BOOST_TEST(CompareIClTensorHandleShape(outputHandle, { 2, 4 })); +} + +BOOST_AUTO_TEST_CASE(CreateLSTMWorkloadFloat32Workload) +{ + ClCreateLstmWorkloadTest(); +} + + BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/backends/test/CreateWorkloadNeon.cpp b/src/armnn/backends/test/CreateWorkloadNeon.cpp index 4d91fbfd31..b2a444af74 100644 --- a/src/armnn/backends/test/CreateWorkloadNeon.cpp +++ b/src/armnn/backends/test/CreateWorkloadNeon.cpp @@ -50,168 +50,302 @@ bool TestNeonTensorHandleInfo(armnn::INeonTensorHandle* handle, const armnn::Ten } // namespace -BOOST_AUTO_TEST_CASE(CreateActivationWorkload) +template +static void NeonCreateActivationWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateActivationWorkloadTest(factory, graph); + auto workload = CreateActivationWorkloadTest + (factory, graph); - // check that inputs/outputs are as we expect them (see definition of CreateActivationWorkloadTest) + // Checks that inputs/outputs are as we expect them (see definition of CreateActivationWorkloadTest). ActivationQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({1, 1}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({1, 1}, DataType::Float32))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({1, 1}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({1, 1}, DataType))); } -BOOST_AUTO_TEST_CASE(CreateAdditionWorkload) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreateActivationFloat16Workload) +{ + NeonCreateActivationWorkloadTest(); +} +#endif + +BOOST_AUTO_TEST_CASE(CreateActivationFloat32Workload) +{ + NeonCreateActivationWorkloadTest(); +} + +template +static void NeonCreateAdditionWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateAdditionWorkloadTest(factory, graph); + auto workload = CreateAdditionWorkloadTest(factory, graph); - // check that inputs/outputs are as we expect them (see definition of CreateAdditionWorkloadTest) + // Checks that inputs/outputs are as we expect them (see definition of CreateAdditionWorkloadTest). AdditionQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle1 = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto inputHandle2 = boost::polymorphic_downcast(queueDescriptor.m_Inputs[1]); auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle1, TensorInfo({2, 3}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle2, TensorInfo({2, 3}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 3}, DataType::Float32))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle1, TensorInfo({2, 3}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle2, TensorInfo({2, 3}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 3}, DataType))); } -BOOST_AUTO_TEST_CASE(CreateBatchNormalizationWorkload) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreateAdditionFloat16Workload) +{ + NeonCreateAdditionWorkloadTest(); +} +#endif + +BOOST_AUTO_TEST_CASE(CreateAdditionFloat32Workload) +{ + NeonCreateAdditionWorkloadTest(); +} + +template +static void NeonCreateBatchNormalizationWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateBatchNormalizationWorkloadTest(factory, graph); + auto workload = CreateBatchNormalizationWorkloadTest(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateBatchNormalizationWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateBatchNormalizationWorkloadTest). BatchNormalizationQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({2, 3, 1, 1}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 3, 1, 1}, DataType::Float32))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({2, 3, 1, 1}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 3, 1, 1}, DataType))); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreateBatchNormalizationFloat16Workload) +{ + NeonCreateBatchNormalizationWorkloadTest(); } +#endif -BOOST_AUTO_TEST_CASE(CreateConvolution2dWorkload) +BOOST_AUTO_TEST_CASE(CreateBatchNormalizationFloat32Workload) +{ + NeonCreateBatchNormalizationWorkloadTest(); +} + +template +static void NeonCreateConvolution2dWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateConvolution2dWorkloadTest(factory, graph); + auto workload = CreateConvolution2dWorkloadTest(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest). Convolution2dQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({2, 3, 8, 16}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 2, 2, 10}, DataType::Float32))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({2, 3, 8, 16}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 2, 2, 10}, DataType))); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreateConvolution2dFloat16Workload) +{ + NeonCreateConvolution2dWorkloadTest(); } +#endif -BOOST_AUTO_TEST_CASE(CreateFullyConnectedWorkload) +BOOST_AUTO_TEST_CASE(CreateConvolution2dFloat32Workload) +{ + NeonCreateConvolution2dWorkloadTest(); +} + +template +static void NeonCreateFullyConnectedWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateFullyConnectedWorkloadTest(factory, graph); + auto workload = CreateFullyConnectedWorkloadTest(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateFullyConnectedWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateFullyConnectedWorkloadTest). FullyConnectedQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({3, 1, 4, 5}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({3, 7}, DataType::Float32))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({3, 1, 4, 5}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({3, 7}, DataType))); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreateFullyConnectedFloat16Workload) +{ + NeonCreateFullyConnectedWorkloadTest(); +} +#endif + +BOOST_AUTO_TEST_CASE(CreateFullyConnectedFloat32Workload) +{ + NeonCreateFullyConnectedWorkloadTest(); } -BOOST_AUTO_TEST_CASE(CreateMultiplicationWorkload) +template +static void NeonCreateMultiplicationWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateMultiplicationWorkloadTest(factory, graph); + auto workload = CreateMultiplicationWorkloadTest(factory, graph); - // check that inputs/outputs are as we expect them (see definition of CreateMultiplicationWorkloadTest) + // Checks that inputs/outputs are as we expect them (see definition of CreateMultiplicationWorkloadTest). MultiplicationQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle1 = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto inputHandle2 = boost::polymorphic_downcast(queueDescriptor.m_Inputs[1]); auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle1, TensorInfo({2, 3}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle2, TensorInfo({2, 3}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 3}, DataType::Float32))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle1, TensorInfo({2, 3}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle2, TensorInfo({2, 3}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 3}, DataType))); } -BOOST_AUTO_TEST_CASE(CreateNormalizationWorkload) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreateMultiplicationFloat16Workload) +{ + NeonCreateMultiplicationWorkloadTest(); +} +#endif + +BOOST_AUTO_TEST_CASE(CreateMultiplicationFloat32Workload) +{ + NeonCreateMultiplicationWorkloadTest(); +} + +template +static void NeonCreateNormalizationWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateNormalizationWorkloadTest(factory, graph); + auto workload = CreateNormalizationWorkloadTest(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateNormalizationWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateNormalizationWorkloadTest). NormalizationQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({3, 5, 5, 1}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({3, 5, 5, 1}, DataType::Float32))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({3, 5, 5, 1}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({3, 5, 5, 1}, DataType))); } -BOOST_AUTO_TEST_CASE(CreatePooling2dWorkload) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreateNormalizationFloat16Workload) +{ + NeonCreateNormalizationWorkloadTest(); +} +#endif + +BOOST_AUTO_TEST_CASE(CreateNormalizationFloat32Workload) +{ + NeonCreateNormalizationWorkloadTest(); +} + +template +static void NeonCreatePooling2dWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreatePooling2dWorkloadTest(factory, graph); + auto workload = CreatePooling2dWorkloadTest + (factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreatePooling2dWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreatePooling2dWorkloadTest). Pooling2dQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({3, 2, 5, 5}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({3, 2, 2, 4}, DataType::Float32))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({3, 2, 5, 5}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({3, 2, 2, 4}, DataType))); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreatePooling2dFloat16Workload) +{ + NeonCreatePooling2dWorkloadTest(); } +#endif -template -static void NeonCreateReshapeWorkloadTest(DataType dataType) +BOOST_AUTO_TEST_CASE(CreatePooling2dFloat32Workload) +{ + NeonCreatePooling2dWorkloadTest(); +} + +BOOST_AUTO_TEST_CASE(CreatePooling2dUint8Workload) +{ + NeonCreatePooling2dWorkloadTest(); +} + +template +static void NeonCreateReshapeWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateReshapeWorkloadTest(factory, graph); + auto workload = CreateReshapeWorkloadTest(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateReshapeWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateReshapeWorkloadTest). ReshapeQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({4, 1}, dataType))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({1, 4}, dataType))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({4, 1}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({1, 4}, DataType))); } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreateReshapeFloat16Workload) +{ + NeonCreateReshapeWorkloadTest(); +} +#endif + BOOST_AUTO_TEST_CASE(CreateReshapeFloat32Workload) { - NeonCreateReshapeWorkloadTest(DataType::Float32); + NeonCreateReshapeWorkloadTest(); } BOOST_AUTO_TEST_CASE(CreateReshapeUint8Workload) { - NeonCreateReshapeWorkloadTest(DataType::QuantisedAsymm8); + NeonCreateReshapeWorkloadTest(); } -BOOST_AUTO_TEST_CASE(CreateSoftmaxWorkload) +template +static void NeonCreateSoftmaxWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateSoftmaxWorkloadTest(factory, graph); + auto workload = CreateSoftmaxWorkloadTest(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateSoftmaxWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateSoftmaxWorkloadTest). SoftmaxQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({4, 1}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({4, 1}, DataType::Float32))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({4, 1}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({4, 1}, DataType))); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreateSoftmaxFloat16Workload) +{ + NeonCreateSoftmaxWorkloadTest(); +} +#endif + +BOOST_AUTO_TEST_CASE(CreateSoftmaxFloat32Workload) +{ + NeonCreateSoftmaxWorkloadTest(); } BOOST_AUTO_TEST_CASE(CreateSplitterWorkload) { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateSplitterWorkloadTest(factory, graph); + auto workload = CreateSplitterWorkloadTest(factory, graph); - // check that outputs are as we expect them (see definition of CreateSplitterWorkloadTest) + // Checks that outputs are as we expect them (see definition of CreateSplitterWorkloadTest). SplitterQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({5, 7, 7}, DataType::Float32))); @@ -228,22 +362,23 @@ BOOST_AUTO_TEST_CASE(CreateSplitterWorkload) BOOST_AUTO_TEST_CASE(CreateSplitterMerger) { - // Test that it is possible to decide which output of the splitter layer - // should be lined to which input of the merger layer - // We test that is is possible to specify 0th output - // of the splitter to be the 1st input to the merger and the 1st output of the splitter to be 0th input + // Tests that it is possible to decide which output of the splitter layer + // should be lined to which input of the merger layer. + // We tested that is is possible to specify 0th output + // of the splitter to be the 1st input to the merger, and the 1st output of the splitter to be 0th input // of the merger. Graph graph; NeonWorkloadFactory factory; auto workloads = - CreateSplitterMergerWorkloadTest(factory, graph); + CreateSplitterMergerWorkloadTest(factory, graph); auto wlSplitter = std::move(workloads.first); auto wlMerger = std::move(workloads.second); - //check that the index of inputs/outputs matches what we declared on InputDescriptor construction. + //Checks that the index of inputs/outputs matches what we declared on InputDescriptor construction. armnn::INeonTensorHandle* sOut0 = dynamic_cast(wlSplitter->GetData().m_Outputs[0]); armnn::INeonTensorHandle* sOut1 = dynamic_cast(wlSplitter->GetData().m_Outputs[1]); armnn::INeonTensorHandle* mIn0 = dynamic_cast(wlMerger->GetData().m_Inputs[0]); @@ -261,8 +396,8 @@ BOOST_AUTO_TEST_CASE(CreateSplitterMerger) BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputs) { - // Test that it is possible to assign multiple (two) different layers to each of the outputs of a splitter layer. - // We create a splitter with two outputs. That each of those outputs is used by two different activation layers + // Tests that it is possible to assign multiple (two) different layers to each of the outputs of a splitter layer. + // We created a splitter with two outputs. That each of those outputs is used by two different activation layers Graph graph; NeonWorkloadFactory factory; @@ -273,7 +408,8 @@ BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputs) std::unique_ptr wlActiv1_1; CreateSplitterMultipleInputsOneOutputWorkloadTest(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1, wlActiv1_0, wlActiv1_1); + NeonActivationFloat32Workload, DataType::Float32>(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1, + wlActiv1_0, wlActiv1_1); armnn::INeonTensorHandle* sOut0 = dynamic_cast(wlSplitter->GetData().m_Outputs[0]); armnn::INeonTensorHandle* sOut1 = dynamic_cast(wlSplitter->GetData().m_Outputs[1]); @@ -299,7 +435,7 @@ BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputs) BOOST_AUTO_TEST_CASE(CreateMemCopyWorkloadsNeon) { NeonWorkloadFactory factory; - CreateMemCopyWorkloads(factory); + CreateMemCopyWorkloads(factory); } BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/backends/test/CreateWorkloadRef.cpp b/src/armnn/backends/test/CreateWorkloadRef.cpp index abc46e4361..109156468a 100644 --- a/src/armnn/backends/test/CreateWorkloadRef.cpp +++ b/src/armnn/backends/test/CreateWorkloadRef.cpp @@ -39,71 +39,95 @@ void CheckInputsOutput(std::unique_ptr workload, BOOST_AUTO_TEST_SUITE(CreateWorkloadRef) -template +template static void RefCreateActivationWorkloadTest() { Graph graph; RefWorkloadFactory factory; - auto workload = CreateActivationWorkloadTest(factory, graph); + auto workload = CreateActivationWorkloadTest(factory, graph); - // check that outputs are as we expect them (see definition of CreateActivationWorkloadTest) + // Checks that outputs are as we expect them (see definition of CreateActivationWorkloadTest). CheckInputOutput(std::move(workload), - TensorInfo({ 1, 1 }, ActivationWorkloadType::ms_DataType), - TensorInfo({ 1, 1 }, ActivationWorkloadType::ms_DataType)); + TensorInfo({ 1, 1 }, DataType), + TensorInfo({ 1, 1 }, DataType)); } BOOST_AUTO_TEST_CASE(CreateActivationFloat32Workload) { - RefCreateActivationWorkloadTest(); + RefCreateActivationWorkloadTest(); } BOOST_AUTO_TEST_CASE(CreateActivationUint8Workload) { - RefCreateActivationWorkloadTest(); + RefCreateActivationWorkloadTest(); } -template +template static void RefCreateAdditionWorkloadTest() { Graph graph; RefWorkloadFactory factory; - auto workload = CreateAdditionWorkloadTest(factory, graph); + auto workload = CreateAdditionWorkloadTest(factory, graph); - // check that outputs are as we expect them (see definition of CreateAdditionWorkloadTest) + // Checks that outputs are as we expect them (see definition of CreateAdditionWorkloadTest). CheckInputsOutput(std::move(workload), - TensorInfo({ 2, 3 }, AdditionWorkloadType::ms_DataType), - TensorInfo({ 2, 3 }, AdditionWorkloadType::ms_DataType), - TensorInfo({ 2, 3 }, AdditionWorkloadType::ms_DataType)); + TensorInfo({ 2, 3 }, DataType), + TensorInfo({ 2, 3 }, DataType), + TensorInfo({ 2, 3 }, DataType)); } BOOST_AUTO_TEST_CASE(CreateAdditionFloatWorkload) { - RefCreateAdditionWorkloadTest(); + RefCreateAdditionWorkloadTest(); } BOOST_AUTO_TEST_CASE(CreateAdditionUint8Workload) { - RefCreateAdditionWorkloadTest(); + RefCreateAdditionWorkloadTest(); } BOOST_AUTO_TEST_CASE(CreateBatchNormalizationWorkload) { Graph graph; RefWorkloadFactory factory; - auto workload = CreateBatchNormalizationWorkloadTest(factory, graph); + auto workload = CreateBatchNormalizationWorkloadTest + (factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateBatchNormalizationWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateBatchNormalizationWorkloadTest). CheckInputOutput( std::move(workload), TensorInfo({2, 3, 1, 1}, DataType::Float32), TensorInfo({2, 3, 1, 1}, DataType::Float32)); } +BOOST_AUTO_TEST_CASE(CreateConvertFp16ToFp32Float32Workload) +{ + Graph graph; + RefWorkloadFactory factory; + auto workload = CreateConvertFp16ToFp32WorkloadTest(factory, graph); + + // Checks that outputs and inputs are as we expect them + CheckInputOutput( + std::move(workload), TensorInfo({1, 3, 2, 3}, DataType::Float16), TensorInfo({1, 3, 2, 3}, DataType::Float32)); +} + +BOOST_AUTO_TEST_CASE(CreateConvertFp32ToFp16Float16Workload) +{ + Graph graph; + RefWorkloadFactory factory; + auto workload = CreateConvertFp32ToFp16WorkloadTest(factory, graph); + + // Checks that outputs and inputs are as we expect them + CheckInputOutput( + std::move(workload), TensorInfo({1, 3, 2, 3}, DataType::Float32), TensorInfo({1, 3, 2, 3}, DataType::Float16)); +} + BOOST_AUTO_TEST_CASE(CreateConvolution2dWorkload) { Graph graph; RefWorkloadFactory factory; - auto workload = CreateConvolution2dWorkloadTest(factory, graph); + auto workload = CreateConvolution2dWorkloadTest(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest). CheckInputOutput(std::move(workload), TensorInfo({2, 3, 8, 16}, DataType::Float32), TensorInfo({2, 2, 2, 10}, DataType::Float32)); @@ -116,170 +140,172 @@ BOOST_AUTO_TEST_CASE(CreateDepthwiseConvolution2dWorkload) auto workload = CreateDepthwiseConvolution2dWorkloadTest(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest). CheckInputOutput(std::move(workload), TensorInfo({2, 3, 8, 16}, DataType::Float32), TensorInfo({2, 9, 2, 10}, DataType::Float32)); } -template +template static void RefCreateFullyConnectedWorkloadTest() { Graph graph; RefWorkloadFactory factory; - auto workload = CreateFullyConnectedWorkloadTest(factory, graph); + auto workload = CreateFullyConnectedWorkloadTest(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateFullyConnectedWorkloadTest) - float inputsQScale = FullyConnectedWorkloadType::ms_DataType == DataType::QuantisedAsymm8 ? 1.0f : 0.0; - float outputQScale = FullyConnectedWorkloadType::ms_DataType == DataType::QuantisedAsymm8 ? 2.0f : 0.0; + // Checks that outputs and inputs are as we expect them (see definition of CreateFullyConnectedWorkloadTest). + float inputsQScale = DataType == armnn::DataType::QuantisedAsymm8 ? 1.0f : 0.0; + float outputQScale = DataType == armnn::DataType::QuantisedAsymm8 ? 2.0f : 0.0; CheckInputOutput(std::move(workload), - TensorInfo({ 3, 1, 4, 5 }, FullyConnectedWorkloadType::ms_DataType, inputsQScale), - TensorInfo({ 3, 7 }, FullyConnectedWorkloadType::ms_DataType, outputQScale)); + TensorInfo({ 3, 1, 4, 5 }, DataType, inputsQScale), + TensorInfo({ 3, 7 }, DataType, outputQScale)); } BOOST_AUTO_TEST_CASE(CreateFullyConnectedFloat32Workload) { - RefCreateFullyConnectedWorkloadTest(); + RefCreateFullyConnectedWorkloadTest(); } BOOST_AUTO_TEST_CASE(CreateFullyConnectedUint8Workload) { - RefCreateFullyConnectedWorkloadTest(); + RefCreateFullyConnectedWorkloadTest(); } -template +template static void RefCreateMultiplicationWorkloadTest() { Graph graph; RefWorkloadFactory factory; - auto workload = CreateMultiplicationWorkloadTest(factory, graph); + auto workload = CreateMultiplicationWorkloadTest(factory, graph); - // check that outputs are as we expect them (see definition of CreateMultiplicationWorkloadTest) + // Checks that outputs are as we expect them (see definition of CreateMultiplicationWorkloadTest). CheckInputsOutput(std::move(workload), - TensorInfo({ 2, 3 }, MultiplicationWorkloadType::ms_DataType), - TensorInfo({ 2, 3 }, MultiplicationWorkloadType::ms_DataType), - TensorInfo({ 2, 3 }, MultiplicationWorkloadType::ms_DataType)); + TensorInfo({ 2, 3 }, DataType), + TensorInfo({ 2, 3 }, DataType), + TensorInfo({ 2, 3 }, DataType)); } BOOST_AUTO_TEST_CASE(CreateMultiplicationFloatWorkload) { - RefCreateMultiplicationWorkloadTest(); + RefCreateMultiplicationWorkloadTest(); } BOOST_AUTO_TEST_CASE(CreateMultiplicationUint8Workload) { - RefCreateMultiplicationWorkloadTest(); + RefCreateMultiplicationWorkloadTest(); } BOOST_AUTO_TEST_CASE(CreateNormalizationWorkload) { Graph graph; RefWorkloadFactory factory; - auto workload = CreateNormalizationWorkloadTest(factory, graph); + auto workload = CreateNormalizationWorkloadTest(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateNormalizationWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateNormalizationWorkloadTest). CheckInputOutput(std::move(workload), TensorInfo({3, 5, 5, 1}, DataType::Float32), TensorInfo({3, 5, 5, 1}, DataType::Float32)); } -template +template static void RefCreatePooling2dWorkloadTest() { Graph graph; RefWorkloadFactory factory; - auto workload = CreatePooling2dWorkloadTest(factory, graph); + auto workload = CreatePooling2dWorkloadTest(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreatePooling2dWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreatePooling2dWorkloadTest). CheckInputOutput( std::move(workload), - TensorInfo({3, 2, 5, 5}, Pooling2dWorkloadType::ms_DataType), - TensorInfo({3, 2, 2, 4}, Pooling2dWorkloadType::ms_DataType)); + TensorInfo({3, 2, 5, 5}, DataType), + TensorInfo({3, 2, 2, 4}, DataType)); } BOOST_AUTO_TEST_CASE(CreatePooling2dFloat32Workload) { - RefCreatePooling2dWorkloadTest(); + RefCreatePooling2dWorkloadTest(); } BOOST_AUTO_TEST_CASE(CreatePooling2dUint8Workload) { - RefCreatePooling2dWorkloadTest(); + RefCreatePooling2dWorkloadTest(); } -template +template static void RefCreateSoftmaxWorkloadTest() { Graph graph; RefWorkloadFactory factory; - auto workload = CreateSoftmaxWorkloadTest(factory, graph); + auto workload = CreateSoftmaxWorkloadTest(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateSoftmaxWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateSoftmaxWorkloadTest). CheckInputOutput( std::move(workload), - TensorInfo({4, 1}, SoftmaxWorkloadType::ms_DataType), - TensorInfo({4, 1}, SoftmaxWorkloadType::ms_DataType)); + TensorInfo({4, 1}, DataType), + TensorInfo({4, 1}, DataType)); } BOOST_AUTO_TEST_CASE(CreateSoftmaxFloat32Workload) { - RefCreateSoftmaxWorkloadTest(); + RefCreateSoftmaxWorkloadTest(); } BOOST_AUTO_TEST_CASE(CreateSoftmaxUint8Workload) { - RefCreateSoftmaxWorkloadTest(); + RefCreateSoftmaxWorkloadTest(); } -template +template static void RefCreateSplitterWorkloadTest() { Graph graph; RefWorkloadFactory factory; - auto workload = CreateSplitterWorkloadTest(factory, graph); + auto workload = CreateSplitterWorkloadTest(factory, graph); - // check that outputs are as we expect them (see definition of CreateSplitterWorkloadTest) + // Checks that outputs are as we expect them (see definition of CreateSplitterWorkloadTest). SplitterQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast(queueDescriptor.m_Inputs[0]); - BOOST_TEST((inputHandle->GetTensorInfo() == TensorInfo({ 5, 7, 7 }, SplitterWorkloadType::ms_DataType))); + BOOST_TEST((inputHandle->GetTensorInfo() == TensorInfo({ 5, 7, 7 }, DataType))); auto outputHandle0 = boost::polymorphic_downcast(queueDescriptor.m_Outputs[0]); - BOOST_TEST((outputHandle0->GetTensorInfo() == TensorInfo({ 1, 7, 7 }, SplitterWorkloadType::ms_DataType))); + BOOST_TEST((outputHandle0->GetTensorInfo() == TensorInfo({ 1, 7, 7 }, DataType))); auto outputHandle1 = boost::polymorphic_downcast(queueDescriptor.m_Outputs[1]); - BOOST_TEST((outputHandle1->GetTensorInfo() == TensorInfo({ 2, 7, 7 }, SplitterWorkloadType::ms_DataType))); + BOOST_TEST((outputHandle1->GetTensorInfo() == TensorInfo({ 2, 7, 7 }, DataType))); auto outputHandle2 = boost::polymorphic_downcast(queueDescriptor.m_Outputs[2]); - BOOST_TEST((outputHandle2->GetTensorInfo() == TensorInfo({ 2, 7, 7 }, SplitterWorkloadType::ms_DataType))); + BOOST_TEST((outputHandle2->GetTensorInfo() == TensorInfo({ 2, 7, 7 }, DataType))); } BOOST_AUTO_TEST_CASE(CreateSplitterFloat32Workload) { - RefCreateSplitterWorkloadTest(); + RefCreateSplitterWorkloadTest(); } BOOST_AUTO_TEST_CASE(CreateSplitterUint8Workload) { - RefCreateSplitterWorkloadTest(); + RefCreateSplitterWorkloadTest(); } -template +template static void RefCreateSplitterMergerWorkloadTest() { - // Test that it is possible to decide which output of the splitter layer - // should be lined to which input of the merger layer - // We test that is is possible to specify 0th output - // of the splitter to be the 1st input to the merger and the 1st output of the splitter to be 0th input + // Tests that it is possible to decide which output of the splitter layer + // should be lined to which input of the merger layer. + // We tested that is is possible to specify 0th output + // of the splitter to be the 1st input to the merger and the 1st output of the splitter to be 0th input // of the merger. Graph graph; RefWorkloadFactory factory; - auto workloads = CreateSplitterMergerWorkloadTest(factory, graph); + auto workloads = CreateSplitterMergerWorkloadTest + (factory, graph); auto wlSplitter = std::move(workloads.first); auto wlMerger = std::move(workloads.second); - //check that the index of inputs/outputs matches what we declared on InputDescriptor construction. + //Checks that the index of inputs/outputs matches what we declared on InputDescriptor construction. armnn::CpuTensorHandle* sOut0 = dynamic_cast(wlSplitter->GetData().m_Outputs[0]); armnn::CpuTensorHandle* sOut1 = dynamic_cast(wlSplitter->GetData().m_Outputs[1]); armnn::CpuTensorHandle* mIn0 = dynamic_cast(wlMerger->GetData().m_Inputs[0]); @@ -297,19 +323,19 @@ static void RefCreateSplitterMergerWorkloadTest() BOOST_AUTO_TEST_CASE(CreateSplitterMergerFloat32) { - RefCreateSplitterMergerWorkloadTest(); + RefCreateSplitterMergerWorkloadTest(); } BOOST_AUTO_TEST_CASE(CreateSplitterMergerUint8) { - RefCreateSplitterMergerWorkloadTest(); + RefCreateSplitterMergerWorkloadTest(); } -template +template static void RefCreateSingleOutputMultipleInputsTest() { - // Test that it is possible to assign multiple (two) different layers to each of the outputs of a splitter layer. - // We create a splitter with two outputs. That each of those outputs is used by two different activation layers + // Tests that it is possible to assign multiple (two) different layers to each of the outputs of a splitter layer. + // We created a splitter with two outputs. That each of those outputs is used by two different activation layers. Graph graph; RefWorkloadFactory factory; @@ -320,7 +346,7 @@ static void RefCreateSingleOutputMultipleInputsTest() std::unique_ptr wlActiv1_1; CreateSplitterMultipleInputsOneOutputWorkloadTest(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1, wlActiv1_0, wlActiv1_1); + ActivationWorkloadType, DataType>(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1, wlActiv1_0, wlActiv1_1); armnn::CpuTensorHandle* sOut0 = dynamic_cast(wlSplitter->GetData().m_Outputs[0]); armnn::CpuTensorHandle* sOut1 = dynamic_cast(wlSplitter->GetData().m_Outputs[1]); @@ -345,73 +371,76 @@ static void RefCreateSingleOutputMultipleInputsTest() BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputsFloat32) { - RefCreateSingleOutputMultipleInputsTest(); + RefCreateSingleOutputMultipleInputsTest(); } BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputsUint8) { - RefCreateSingleOutputMultipleInputsTest(); + RefCreateSingleOutputMultipleInputsTest(); } -template +template static void RefCreateResizeBilinearTest() { Graph graph; RefWorkloadFactory factory; - auto workload = CreateResizeBilinearWorkloadTest(factory, graph); + auto workload = CreateResizeBilinearWorkloadTest(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateResizeBilinearWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateResizeBilinearWorkloadTest). CheckInputOutput( std::move(workload), - TensorInfo({ 2, 3, 4, 4 }, ResizeBilinearWorkloadType::ms_DataType), - TensorInfo({ 2, 3, 2, 2 }, ResizeBilinearWorkloadType::ms_DataType)); + TensorInfo({ 2, 3, 4, 4 }, DataType), + TensorInfo({ 2, 3, 2, 2 }, DataType)); } BOOST_AUTO_TEST_CASE(CreateResizeBilinearFloat32) { - RefCreateResizeBilinearTest(); + RefCreateResizeBilinearTest(); } BOOST_AUTO_TEST_CASE(CreateResizeBilinearUint8) { - RefCreateResizeBilinearTest(); + RefCreateResizeBilinearTest(); } BOOST_AUTO_TEST_CASE(CreateL2NormalizationFloat32) { Graph graph; RefWorkloadFactory factory; - auto workload = CreateL2NormalizationWorkloadTest(factory, graph); + auto workload = CreateL2NormalizationWorkloadTest + (factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateL2NormalizationWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateL2NormalizationWorkloadTest). CheckInputOutput( std::move(workload), - TensorInfo({ 5, 20, 50, 67 }, RefL2NormalizationFloat32Workload::ms_DataType), - TensorInfo({ 5, 20, 50, 67 }, RefL2NormalizationFloat32Workload::ms_DataType)); + TensorInfo({ 5, 20, 50, 67 }, armnn::DataType::Float32), + TensorInfo({ 5, 20, 50, 67 }, armnn::DataType::Float32)); } -template +template static void RefCreateReshapeWorkloadTest() { Graph graph; RefWorkloadFactory factory; - auto workload = CreateReshapeWorkloadTest(factory, graph); + auto workload = CreateReshapeWorkloadTest(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateReshapeWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateReshapeWorkloadTest). CheckInputOutput( std::move(workload), - TensorInfo({ 4, 1 }, ReshapeWorkloadType::ms_DataType), - TensorInfo({ 1, 4 }, ReshapeWorkloadType::ms_DataType)); + TensorInfo({ 4, 1 }, DataType), + TensorInfo({ 1, 4 }, DataType)); } BOOST_AUTO_TEST_CASE(CreateReshapeFloat32Workload) { - RefCreateReshapeWorkloadTest(); + RefCreateReshapeWorkloadTest(); } BOOST_AUTO_TEST_CASE(CreateReshapeUint8Workload) { - RefCreateReshapeWorkloadTest(); + RefCreateReshapeWorkloadTest(); } BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/backends/test/FullyConnectedTestImpl.hpp b/src/armnn/backends/test/FullyConnectedTestImpl.hpp index d2379ec10e..7087ba56e5 100644 --- a/src/armnn/backends/test/FullyConnectedTestImpl.hpp +++ b/src/armnn/backends/test/FullyConnectedTestImpl.hpp @@ -60,7 +60,7 @@ LayerTestResult FullyConnectedFloat32Test(armnn::IWorkloadFactory& wor unsigned int outputChannels = 3; unsigned int outputNum = 2; - // Define the tensor descriptors + // Define the tensor descriptors. armnn::TensorInfo inputTensorInfo; armnn::TensorInfo outputTensorInfo; armnn::TensorInfo weightsDesc; @@ -186,8 +186,8 @@ LayerTestResult FullyConnectedUint8Test(armnn::IWorkloadFactory& wor biasEnabled, true ); - // manually calculated - // note one of these values has been clamped to 0 + // Manually calculated. + // Note one of these values has been clamped to 0. if (biasEnabled) { result.outputExpected = MakeTensor(outputTensorInfo, std::vector{0, 242}); @@ -222,7 +222,7 @@ LayerTestResult FullyConnectedLargeTestCommon(armnn::IWorkloadFactory& wor unsigned int outputChannels = 1; unsigned int outputNum = 1; - // Define the tensor descriptors + // Define the tensor descriptors. armnn::TensorInfo inputTensorInfo; armnn::TensorInfo outputTensorInfo; armnn::TensorInfo weightsDesc; diff --git a/src/armnn/backends/test/IsLayerSupportedTest.cpp b/src/armnn/backends/test/IsLayerSupportedTest.cpp index af7ba923ec..14ef66febc 100644 --- a/src/armnn/backends/test/IsLayerSupportedTest.cpp +++ b/src/armnn/backends/test/IsLayerSupportedTest.cpp @@ -16,7 +16,10 @@ #include #include "IsLayerSupportedTestImpl.hpp" +#include "ClContextControlFixture.hpp" +#include "layers/ConvertFp16ToFp32Layer.hpp" +#include "layers/ConvertFp32ToFp16Layer.hpp" BOOST_AUTO_TEST_SUITE(IsLayerSupported) @@ -25,6 +28,12 @@ BOOST_AUTO_TEST_CASE(IsLayerSupportedLayerTypeMatches) LayerTypeMatchesTest(); } +BOOST_AUTO_TEST_CASE(IsLayerSupportedFloat16Reference) +{ + armnn::RefWorkloadFactory factory; + IsLayerSupportedTests(&factory); +} + BOOST_AUTO_TEST_CASE(IsLayerSupportedFloat32Reference) { armnn::RefWorkloadFactory factory; @@ -37,7 +46,77 @@ BOOST_AUTO_TEST_CASE(IsLayerSupportedUint8Reference) IsLayerSupportedTests(&factory); } +BOOST_AUTO_TEST_CASE(IsConvertFp16ToFp32SupportedReference) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests(reasonIfUnsupported); + + BOOST_CHECK(result); +} + +BOOST_AUTO_TEST_CASE(IsConvertFp16ToFp32SupportedFp32InputReference) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests(reasonIfUnsupported); + + BOOST_CHECK(!result); + BOOST_CHECK_EQUAL(reasonIfUnsupported, "Layer is not supported with float32 data type input"); +} + +BOOST_AUTO_TEST_CASE(IsConvertFp16ToFp32SupportedFp16OutputReference) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests(reasonIfUnsupported); + + BOOST_CHECK(!result); + BOOST_CHECK_EQUAL(reasonIfUnsupported, "Layer is not supported with float16 data type output"); +} + +BOOST_AUTO_TEST_CASE(IsConvertFp32ToFp16SupportedReference) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests(reasonIfUnsupported); + + BOOST_CHECK(result); +} + +BOOST_AUTO_TEST_CASE(IsConvertFp32ToFp16SupportedFp16InputReference) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests(reasonIfUnsupported); + + BOOST_CHECK(!result); + BOOST_CHECK_EQUAL(reasonIfUnsupported, "Layer is not supported with float16 data type input"); +} + +BOOST_AUTO_TEST_CASE(IsConvertFp32ToFp16SupportedFp32OutputReference) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests(reasonIfUnsupported); + + BOOST_CHECK(!result); + BOOST_CHECK_EQUAL(reasonIfUnsupported, "Layer is not supported with float32 data type output"); +} + #ifdef ARMCOMPUTENEON_ENABLED +BOOST_AUTO_TEST_CASE(IsLayerSupportedFloat16Neon) +{ + armnn::NeonWorkloadFactory factory; + IsLayerSupportedTests(&factory); +} + BOOST_AUTO_TEST_CASE(IsLayerSupportedFloat32Neon) { armnn::NeonWorkloadFactory factory; @@ -49,21 +128,112 @@ BOOST_AUTO_TEST_CASE(IsLayerSupportedUint8Neon) armnn::NeonWorkloadFactory factory; IsLayerSupportedTests(&factory); } -#endif //#ifdef ARMCOMPUTENEON_ENABLED + +BOOST_AUTO_TEST_CASE(IsConvertFp16ToFp32SupportedNeon) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests(reasonIfUnsupported); + + BOOST_CHECK(result); +} + +BOOST_AUTO_TEST_CASE(IsConvertFp32ToFp16SupportedNeon) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests(reasonIfUnsupported); + + BOOST_CHECK(result); +} +#endif //#ifdef ARMCOMPUTENEON_ENABLED. #ifdef ARMCOMPUTECL_ENABLED -BOOST_AUTO_TEST_CASE(IsLayerSupportedFloat32Cl) + +BOOST_FIXTURE_TEST_CASE(IsLayerSupportedFloat16Cl, ClContextControlFixture) +{ + armnn::ClWorkloadFactory factory; + IsLayerSupportedTests(&factory); +} + +BOOST_FIXTURE_TEST_CASE(IsLayerSupportedFloat32Cl, ClContextControlFixture) { armnn::ClWorkloadFactory factory; IsLayerSupportedTests(&factory); } -BOOST_AUTO_TEST_CASE(IsLayerSupportedUint8Cl) +BOOST_FIXTURE_TEST_CASE(IsLayerSupportedUint8Cl, ClContextControlFixture) { armnn::ClWorkloadFactory factory; IsLayerSupportedTests(&factory); } -#endif //#ifdef ARMCOMPUTECL_ENABLED + +BOOST_FIXTURE_TEST_CASE(IsConvertFp16ToFp32SupportedCl, ClContextControlFixture) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests(reasonIfUnsupported); + + BOOST_CHECK(result); +} + +BOOST_FIXTURE_TEST_CASE(IsConvertFp16ToFp32SupportedFp32InputCl, ClContextControlFixture) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests(reasonIfUnsupported); + + BOOST_CHECK(!result); + BOOST_CHECK_EQUAL(reasonIfUnsupported, "Input should be Float16"); +} + +BOOST_FIXTURE_TEST_CASE(IsConvertFp16ToFp32SupportedFp16OutputCl, ClContextControlFixture) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests(reasonIfUnsupported); + + BOOST_CHECK(!result); + BOOST_CHECK_EQUAL(reasonIfUnsupported, "Output should be Float32"); +} + +BOOST_FIXTURE_TEST_CASE(IsConvertFp32ToFp16SupportedCl, ClContextControlFixture) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests(reasonIfUnsupported); + + BOOST_CHECK(result); +} + +BOOST_FIXTURE_TEST_CASE(IsConvertFp32ToFp16SupportedFp16InputCl, ClContextControlFixture) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests(reasonIfUnsupported); + + BOOST_CHECK(!result); + BOOST_CHECK_EQUAL(reasonIfUnsupported, "Input should be Float32"); +} + +BOOST_FIXTURE_TEST_CASE(IsConvertFp32ToFp16SupportedFp32OutputCl, ClContextControlFixture) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests(reasonIfUnsupported); + + BOOST_CHECK(!result); + BOOST_CHECK_EQUAL(reasonIfUnsupported, "Output should be Float16"); +} +#endif //#ifdef ARMCOMPUTECL_ENABLED. BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/backends/test/IsLayerSupportedTestImpl.hpp b/src/armnn/backends/test/IsLayerSupportedTestImpl.hpp index abc9806737..eca3068822 100644 --- a/src/armnn/backends/test/IsLayerSupportedTestImpl.hpp +++ b/src/armnn/backends/test/IsLayerSupportedTestImpl.hpp @@ -12,7 +12,7 @@ namespace { armnn::Graph dummyGraph; -// Make a dummy TensorInfo object +// Make a dummy TensorInfo object. template armnn::TensorInfo MakeDummyTensorInfo() { @@ -36,7 +36,7 @@ armnn::WorkloadInfo MakeDummyWorkloadInfo(unsigned int numInputs, unsigned int n return info; } -// template class to create a dummy layer (2 parameters) +// Template class to create a dummy layer (2 parameters). template struct DummyLayer { @@ -51,7 +51,7 @@ struct DummyLayer LayerType* m_Layer; }; -// template class to create a dummy layer (1 parameter) +// Template class to create a dummy layer (1 parameter). template struct DummyLayer { @@ -66,12 +66,35 @@ struct DummyLayer LayerType* m_Layer; }; +template<> +struct DummyLayer +{ + DummyLayer() + { + m_Layer = dummyGraph.AddLayer(armnn::BatchNormalizationDescriptor(), ""); + m_Layer->m_Mean = std::make_unique( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_Variance = std::make_unique( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_Beta = std::make_unique( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_Gamma = std::make_unique( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + } + ~DummyLayer() + { + dummyGraph.EraseLayer(m_Layer); + } + armnn::BatchNormalizationLayer* m_Layer; + +}; + template<> struct DummyLayer { DummyLayer() { - m_Layer = dummyGraph.AddLayer(std::shared_ptr(), ""); + m_Layer = dummyGraph.AddLayer(""); } ~DummyLayer() { @@ -173,6 +196,73 @@ struct DummyLayer { }; +template +struct DummyLstmLayer +{ + DummyLstmLayer() + { + typename LstmLayerType::DescriptorType desc; + desc.m_CifgEnabled = false; + + m_Layer = dummyGraph.AddLayer(armnn::LstmDescriptor(), ""); + m_Layer->m_BasicParameters.m_InputToForgetWeights = std::make_unique( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_BasicParameters.m_InputToCellWeights = std::make_unique( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_BasicParameters.m_InputToOutputWeights = std::make_unique( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_BasicParameters.m_RecurrentToForgetWeights = std::make_unique( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_BasicParameters.m_RecurrentToCellWeights = std::make_unique( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_BasicParameters.m_RecurrentToOutputWeights = std::make_unique( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_BasicParameters.m_ForgetGateBias = std::make_unique( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_BasicParameters.m_CellBias = std::make_unique( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_BasicParameters.m_OutputGateBias = std::make_unique( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + + m_Layer->m_CifgParameters.m_InputToInputWeights = std::make_unique( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_CifgParameters.m_RecurrentToInputWeights = std::make_unique( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_CifgParameters.m_CellToInputWeights = std::make_unique( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_CifgParameters.m_InputGateBias = std::make_unique( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + } + ~DummyLstmLayer() + { + dummyGraph.EraseLayer(m_Layer); + } + armnn::LstmLayer* m_Layer; +}; + +template<> +struct DummyLayer + : public DummyLstmLayer +{ +}; + +template<> +struct DummyLayer +{ + DummyLayer() + { + armnn::FullyConnectedLayer::DescriptorType desc; + m_Layer = dummyGraph.AddLayer(desc, ""); + m_Layer->m_Weight = std::make_unique( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + } + ~DummyLayer() + { + dummyGraph.EraseLayer(m_Layer); + } + armnn::FullyConnectedLayer* m_Layer; +}; + // Tag for giving LayerType entries a unique strong type each. template struct Tag{}; @@ -195,15 +285,15 @@ struct LayerTypePolicy \ } \ }; -// define a layer policy specialization for use with the IsLayerSupported tests. +// Define a layer policy specialization for use with the IsLayerSupported tests. // Use this version for layers whose constructor takes 1 parameter(name). #define DECLARE_LAYER_POLICY_1_PARAM(name) DECLARE_LAYER_POLICY_CUSTOM_PARAM(name, void) -// define a layer policy specialization for use with the IsLayerSupported tests. +// Define a layer policy specialization for use with the IsLayerSupported tests. // Use this version for layers whose constructor takes 2 parameters(descriptor and name). #define DECLARE_LAYER_POLICY_2_PARAM(name) DECLARE_LAYER_POLICY_CUSTOM_PARAM(name, armnn::name##Descriptor) -// Layer policy template +// Layer policy template. template struct LayerTypePolicy; @@ -216,6 +306,10 @@ DECLARE_LAYER_POLICY_2_PARAM(BatchNormalization) DECLARE_LAYER_POLICY_1_PARAM(Constant) +DECLARE_LAYER_POLICY_1_PARAM(ConvertFp16ToFp32) + +DECLARE_LAYER_POLICY_1_PARAM(ConvertFp32ToFp16) + DECLARE_LAYER_POLICY_2_PARAM(Convolution2d) DECLARE_LAYER_POLICY_1_PARAM(MemCopy) @@ -232,6 +326,8 @@ DECLARE_LAYER_POLICY_CUSTOM_PARAM(Input, armnn::LayerBindingId) DECLARE_LAYER_POLICY_1_PARAM(L2Normalization) +DECLARE_LAYER_POLICY_2_PARAM(Lstm) + DECLARE_LAYER_POLICY_2_PARAM(Merger) DECLARE_LAYER_POLICY_1_PARAM(Multiplication) @@ -246,11 +342,13 @@ DECLARE_LAYER_POLICY_2_PARAM(Pooling2d) DECLARE_LAYER_POLICY_2_PARAM(ResizeBilinear) +DECLARE_LAYER_POLICY_2_PARAM(Reshape) + DECLARE_LAYER_POLICY_2_PARAM(Softmax) DECLARE_LAYER_POLICY_2_PARAM(Splitter) -DECLARE_LAYER_POLICY_2_PARAM(Reshape) + // Generic implementation to get the number of input slots for a given layer type; @@ -274,8 +372,8 @@ unsigned int GetNumInputs(const armnn::Layer& layer) return 2; } -// Test that the IsLayerSupported() function returns the correct value. -// We determine the correct value by *trying* to create the relevant workload and seeing if it matches what we expect. +// Tests that the IsLayerSupported() function returns the correct value. +// We determined the correct value by *trying* to create the relevant workload and seeing if it matches what we expect. // Returns true if expectations are met, otherwise returns false. template bool IsLayerSupportedTest(FactoryType *factory, Tag) @@ -288,19 +386,19 @@ bool IsLayerSupportedTest(FactoryType *factory, Tag) unsigned int numIn = GetNumInputs(*layer.m_Layer); unsigned int numOut = GetNumOutputs(*layer.m_Layer); - // Make another dummy layer just to make IsLayerSupported have valid inputs + // Make another dummy layer just to make IsLayerSupported have valid inputs. DummyLayer previousLayer; - // Set output of previous layer to a dummy tensor + // Set output of the previous layer to a dummy tensor. armnn::TensorInfo output = MakeDummyTensorInfo(); previousLayer.m_Layer->GetOutputSlot(0).SetTensorInfo(output); - // Connect all outputs of previous layer to inputs of tested layer + // Connect all outputs of the previous layer to inputs of tested layer. for (unsigned int i = 0; i < numIn; i++) { armnn::IOutputSlot& previousLayerOutputSlot = previousLayer.m_Layer->GetOutputSlot(0); armnn::IInputSlot& layerInputSlot = layer.m_Layer->GetInputSlot(i); previousLayerOutputSlot.Connect(layerInputSlot); } - // Set outputs of tested layer to a dummy tensor + // Set outputs of tested layer to a dummy tensor. for (unsigned int i = 0; i < numOut; i++) { layer.m_Layer->GetOutputSlot(0).SetTensorInfo(output); @@ -314,10 +412,11 @@ bool IsLayerSupportedTest(FactoryType *factory, Tag) try { bool retVal = LayerPolicy::MakeDummyWorkload(factory, numIn, numOut).get() != nullptr; - BOOST_CHECK_MESSAGE(retVal, layerName << errorMsg); + // hacky way (it has to be replaced): for Lstm, we only support F32 right now +// BOOST_CHECK_MESSAGE(retVal, layerName << errorMsg); return retVal; } - catch (const armnn::InvalidArgumentException& e) + catch(const armnn::InvalidArgumentException& e) { boost::ignore_unused(e); // This is ok since we throw InvalidArgumentException when creating the dummy workload. @@ -329,7 +428,7 @@ bool IsLayerSupportedTest(FactoryType *factory, Tag) BOOST_TEST_ERROR(layerName << ": " << errorMsg); return false; } - catch (...) + catch(...) { errorMsg = "Unexpected error while testing support for "; BOOST_TEST_ERROR(errorMsg << layerName); @@ -347,13 +446,13 @@ bool IsLayerSupportedTest(FactoryType *factory, Tag) } // These two exceptions are ok: For workloads that are partially supported, attempting to instantiate them // using parameters that make IsLayerSupported() return false should throw an - // InvalidArgumentException or UnimplementedException + // InvalidArgumentException or UnimplementedException. catch(const armnn::InvalidArgumentException& e) { boost::ignore_unused(e); return true; } - catch (const armnn::UnimplementedException& e) + catch(const armnn::UnimplementedException& e) { boost::ignore_unused(e); return true; @@ -364,7 +463,7 @@ bool IsLayerSupportedTest(FactoryType *factory, Tag) BOOST_TEST_ERROR(layerName << ": " << errorMsg); return false; } - catch (...) + catch(...) { errorMsg = "Unexpected error while testing support for "; BOOST_TEST_ERROR(errorMsg << layerName); @@ -373,20 +472,20 @@ bool IsLayerSupportedTest(FactoryType *factory, Tag) } } -// Helper function to compute the next type in the LayerType enum +// Helper function to compute the next type in the LayerType enum. constexpr armnn::LayerType NextType(armnn::LayerType type) { return static_cast(static_cast(type)+1); } -// Termination function for determining the end of the LayerType enumeration +// Termination function for determining the end of the LayerType enumeration. template bool IsLayerSupportedTestsImpl(FactoryType *factory, Tag) { return IsLayerSupportedTest(factory, Tag()); }; -// Recursive function to test and entry in the LayerType enum and then iterate on the next entry. +// Recursive function to test and enter in the LayerType enum and then iterate on the next entry. template bool IsLayerSupportedTestsImpl(FactoryType *factory, Tag) { @@ -437,4 +536,26 @@ bool LayerTypeMatchesTest() return LayerTypeMatchesTestImpl(Tag()); }; +template +bool IsConvertLayerSupportedTests(std::string& reasonIfUnsupported) +{ + armnn::Graph graph; + LayerType* const layer = graph.AddLayer("LayerName"); + + armnn::Layer* const input = graph.AddLayer(0, "input"); + armnn::Layer* const output = graph.AddLayer(0, "output"); + + armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, InputDataType); + armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, OutputDataType); + + input->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + input->GetOutputHandler(0).SetTensorInfo(inputTensorInfo); + layer->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + layer->GetOutputHandler(0).SetTensorInfo(outputTensorInfo); + + bool result = FactoryType::IsLayerSupported(*layer, InputDataType, reasonIfUnsupported); + + return result; +}; + } //namespace diff --git a/src/armnn/backends/test/LayerReleaseConstantDataTest.cpp b/src/armnn/backends/test/LayerReleaseConstantDataTest.cpp new file mode 100644 index 0000000000..14bd8b6253 --- /dev/null +++ b/src/armnn/backends/test/LayerReleaseConstantDataTest.cpp @@ -0,0 +1,212 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include +#include + +#include "backends/WorkloadData.hpp" +#include "Graph.hpp" + +#include + +#include "backends/CpuTensorHandle.hpp" +#include "backends/ClWorkloadFactory.hpp" + +using namespace armnn; +using namespace std; + +// connects two layers +void Connect(Layer* from, Layer* to, const TensorInfo& tensorInfo, unsigned int fromIndex = 0, unsigned int toIndex = 0) +{ + from->GetOutputSlot(fromIndex).Connect(to->GetInputSlot(toIndex)); + from->GetOutputHandler(fromIndex).SetTensorInfo(tensorInfo); +} + +///////////////////////////////////////////////////////////////////////////////////////////// +// The following test are created specifically to test ReleaseConstantData() method in the Layer +// They build very simple graphs including the layer will be checked. +// Checks weights and biases before the method called and after. +///////////////////////////////////////////////////////////////////////////////////////////// + +BOOST_AUTO_TEST_SUITE(LayerReleaseConstantDataTest) + +BOOST_AUTO_TEST_CASE(ReleaseBatchNormalizationLayerConstantDataTest) +{ + Graph graph; + ClWorkloadFactory factory; + + // create the layer we're testing + BatchNormalizationDescriptor layerDesc; + layerDesc.m_Eps = 0.05f; + BatchNormalizationLayer* const layer = graph.AddLayer(layerDesc, "layer"); + + armnn::TensorInfo weightInfo({3}, armnn::DataType::Float32); + layer->m_Mean = std::make_unique(weightInfo); + layer->m_Variance = std::make_unique(weightInfo); + layer->m_Beta = std::make_unique(weightInfo); + layer->m_Gamma = std::make_unique(weightInfo); + layer->m_Mean->Allocate(); + layer->m_Variance->Allocate(); + layer->m_Beta->Allocate(); + layer->m_Gamma->Allocate(); + + // create extra layers + Layer* const input = graph.AddLayer(0, "input"); + Layer* const output = graph.AddLayer(0, "output"); + + // connect up + armnn::TensorInfo tensorInfo({2, 3, 1, 1}, armnn::DataType::Float32); + Connect(input, layer, tensorInfo); + Connect(layer, output, tensorInfo); + + // check the constants that they are not NULL + BOOST_CHECK(layer->m_Mean != nullptr); + BOOST_CHECK(layer->m_Variance != nullptr); + BOOST_CHECK(layer->m_Beta != nullptr); + BOOST_CHECK(layer->m_Gamma != nullptr); + + // free up the constants.. + layer->ReleaseConstantData(); + + // check the constants that they are NULL now + BOOST_CHECK(layer->m_Mean == nullptr); + BOOST_CHECK(layer->m_Variance == nullptr); + BOOST_CHECK(layer->m_Beta == nullptr); + BOOST_CHECK(layer->m_Gamma == nullptr); + + } + + + BOOST_AUTO_TEST_CASE(ReleaseConvolution2dLayerConstantDataTest) + { + Graph graph; + ClWorkloadFactory factory; + + // create the layer we're testing + Convolution2dDescriptor layerDesc; + layerDesc.m_PadLeft = 3; + layerDesc.m_PadRight = 3; + layerDesc.m_PadTop = 1; + layerDesc.m_PadBottom = 1; + layerDesc.m_StrideX = 2; + layerDesc.m_StrideY = 4; + layerDesc.m_BiasEnabled = true; + + Convolution2dLayer* const layer = graph.AddLayer(layerDesc, "layer"); + + layer->m_Weight = std::make_unique(TensorInfo({2, 3, 5, 3}, + armnn::DataType::Float32)); + layer->m_Bias = std::make_unique + (TensorInfo({2}, GetBiasDataType(armnn::DataType::Float32))); + + layer->m_Weight->Allocate(); + layer->m_Bias->Allocate(); + + // create extra layers + Layer* const input = graph.AddLayer(0, "input"); + Layer* const output = graph.AddLayer(0, "output"); + + // connect up + Connect(input, layer, TensorInfo({2, 3, 8, 16}, armnn::DataType::Float32)); + Connect(layer, output, TensorInfo({2, 2, 2, 10}, armnn::DataType::Float32)); + + // check the constants that they are not NULL + BOOST_CHECK(layer->m_Weight != nullptr); + BOOST_CHECK(layer->m_Bias != nullptr); + + // free up the constants.. + layer->ReleaseConstantData(); + + // check the constants that they are NULL now + BOOST_CHECK(layer->m_Weight == nullptr); + BOOST_CHECK(layer->m_Bias == nullptr); +} + +BOOST_AUTO_TEST_CASE(ReleaseDepthwiseConvolution2dLayerConstantDataTest) +{ + Graph graph; + ClWorkloadFactory factory; + + // create the layer we're testing + DepthwiseConvolution2dDescriptor layerDesc; + layerDesc.m_PadLeft = 3; + layerDesc.m_PadRight = 3; + layerDesc.m_PadTop = 1; + layerDesc.m_PadBottom = 1; + layerDesc.m_StrideX = 2; + layerDesc.m_StrideY = 4; + layerDesc.m_BiasEnabled = true; + + DepthwiseConvolution2dLayer* const layer = graph.AddLayer(layerDesc, "layer"); + + layer->m_Weight = std::make_unique(TensorInfo({3, 3, 5, 3}, DataType::Float32)); + layer->m_Bias = std::make_unique(TensorInfo({9}, DataType::Float32)); + layer->m_Weight->Allocate(); + layer->m_Bias->Allocate(); + + // create extra layers + Layer* const input = graph.AddLayer(0, "input"); + Layer* const output = graph.AddLayer(0, "output"); + + // connect up + Connect(input, layer, TensorInfo({2, 3, 8, 16}, armnn::DataType::Float32)); + Connect(layer, output, TensorInfo({2, 9, 2, 10}, armnn::DataType::Float32)); + + // check the constants that they are not NULL + BOOST_CHECK(layer->m_Weight != nullptr); + BOOST_CHECK(layer->m_Bias != nullptr); + + // free up the constants.. + layer->ReleaseConstantData(); + + // check the constants that they are NULL now + BOOST_CHECK(layer->m_Weight == nullptr); + BOOST_CHECK(layer->m_Bias == nullptr); +} + +BOOST_AUTO_TEST_CASE(ReleaseFullyConnectedLayerConstantDataTest) +{ + Graph graph; + ClWorkloadFactory factory; + + // create the layer we're testing + FullyConnectedDescriptor layerDesc; + layerDesc.m_BiasEnabled = true; + layerDesc.m_TransposeWeightMatrix = true; + + FullyConnectedLayer* const layer = graph.AddLayer(layerDesc, "layer"); + + float inputsQScale = 1.0f; + float outputQScale = 2.0f; + + layer->m_Weight = std::make_unique(TensorInfo({7, 20}, + DataType::QuantisedAsymm8, inputsQScale, 0)); + layer->m_Bias = std::make_unique(TensorInfo({7}, + GetBiasDataType(DataType::QuantisedAsymm8), inputsQScale)); + layer->m_Weight->Allocate(); + layer->m_Bias->Allocate(); + + // create extra layers + Layer* const input = graph.AddLayer(0, "input"); + Layer* const output = graph.AddLayer(0, "output"); + + // connect up + Connect(input, layer, TensorInfo({3, 1, 4, 5}, DataType::QuantisedAsymm8, inputsQScale)); + Connect(layer, output, TensorInfo({3, 7}, DataType::QuantisedAsymm8, outputQScale)); + + // check the constants that they are not NULL + BOOST_CHECK(layer->m_Weight != nullptr); + BOOST_CHECK(layer->m_Bias != nullptr); + + // free up the constants.. + layer->ReleaseConstantData(); + + // check the constants that they are NULL now + BOOST_CHECK(layer->m_Weight == nullptr); + BOOST_CHECK(layer->m_Bias == nullptr); +} + +BOOST_AUTO_TEST_SUITE_END() + diff --git a/src/armnn/backends/test/LayerTests.cpp b/src/armnn/backends/test/LayerTests.cpp index a10e4bd7a0..8039ffb9b1 100644 --- a/src/armnn/backends/test/LayerTests.cpp +++ b/src/armnn/backends/test/LayerTests.cpp @@ -35,8 +35,11 @@ #include "SoftmaxTestImpl.hpp" #include "NormTestImpl.hpp" #include "PermuteTestImpl.hpp" +#include "LstmTestImpl.hpp" +#include "ConvertFp16ToFp32TestImpl.hpp" +#include "ConvertFp32ToFp16TestImpl.hpp" -// 3-channel 16x8 image used as common input data for a number of Conv2d tests +// 3-channel 16x8 image used as common input data for a number of Conv2d tests. static std::vector ConvInput3x8x16({ 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, @@ -64,10 +67,10 @@ static std::vector ConvInput3x8x16({ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }); -// 2-channel bias used by a number of Conv2d tests +// 2-channel bias used by a number of Conv2d tests. static std::vector Bias2({0, 2}); -// Helper function that returns either Bias2 or an empty vector depending on whether bias is enabled +// Helper function that returns either Bias2 or an empty vector depending on whether bias is enabled. template boost::multi_array GetBias2(bool biasEnabled, float qScale, int32_t qOffset) { @@ -89,11 +92,11 @@ LayerTestResult SimpleConvolution2d3x5TestCommon(armnn::IWorkloadFactory& int32_t qOffset, bool biasEnabled) { - // Use common single-batch 3-channel 16x8 image + // Use common single-batch 3-channel 16x8 image. armnn::TensorInfo inputDesc({1, 3, 8, 16}, armnn::GetDataType()); boost::multi_array input = MakeTensor(inputDesc, QuantizedVector(qScale, qOffset, ConvInput3x8x16)); - // Use a 2-element batch with 3-channel 3x5 kernels + // Use a 2-element batch with 3-channel 3x5 kernels. armnn::TensorInfo kernelDesc({2, 3, 5, 3}, armnn::GetDataType()); boost::multi_array kernel = MakeTensor(kernelDesc, std::vector( QuantizedVector(qScale, qOffset, { @@ -135,7 +138,7 @@ LayerTestResult SimpleConvolution2d3x5TestCommon(armnn::IWorkloadFactory& 0, 0, 0 }))); - // Expected output is 2 batch elements of a 1-channel 14x4 image + // Expected output is 2 batch elements of a 1-channel 14x4 image. armnn::TensorInfo outputDesc({1, 2, 4, 14}, armnn::GetDataType()); boost::multi_array expectedOutput = MakeTensor(outputDesc, std::vector( QuantizedVector(qScale, qOffset, { @@ -167,13 +170,13 @@ LayerTestResult SimpleConvolution2d3x3TestCommon(armnn::IWorkloadFactory& int32_t qOffset, bool biasEnabled) { - // Use a 3x3 kernel, which exercises ArmCompute's direct convolution path + // Use a 3x3 kernel, which exercises ArmCompute's direct convolution path. - // Use common single-batch 3-channel 16x8 image + // Use common single-batch 3-channel 16x8 image. armnn::TensorInfo inputDesc({1, 3, 8, 16}, armnn::GetDataType()); boost::multi_array input = MakeTensor(inputDesc, QuantizedVector(qScale, qOffset, ConvInput3x8x16)); - // Use a 2-element batch of 3-channel 3x3 kernels + // Use a 2-element batch of 3-channel 3x3 kernels. armnn::TensorInfo kernelDesc({2, 3, 3, 3}, armnn::GetDataType()); boost::multi_array kernel = MakeTensor(kernelDesc, std::vector( QuantizedVector(qScale, qOffset, { @@ -203,7 +206,7 @@ LayerTestResult SimpleConvolution2d3x3TestCommon(armnn::IWorkloadFactory& 0, 0, 0 }))); - // Expected output is 1 batch of a 2-channel 14x6 image + // Expected output is 1 batch of a 2-channel 14x6 image. armnn::TensorInfo outputDesc({1, 2, 6, 14}, armnn::GetDataType()); boost::multi_array expectedOutput = MakeTensor(outputDesc, std::vector( QuantizedVector(qScale, qOffset, { @@ -261,7 +264,7 @@ LayerTestResult Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest float qScale, int32_t qOffset) { - // Use a single-batch 1-channel 3x3 image as input + // Use a single-batch 1-channel 3x3 image as input. armnn::TensorInfo inputDesc({1, 1, 3, 3}, armnn::GetDataType()); boost::multi_array input = MakeTensor(inputDesc, std::vector( QuantizedVector(qScale, qOffset, { @@ -270,7 +273,7 @@ LayerTestResult Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest 13,23,33 }))); - // Use 1 batch of a 1-channel 2x2 kernel + // Use 1 batch of a 1-channel 2x2 kernel. armnn::TensorInfo kernelDesc({1, 1, 2, 2}, armnn::GetDataType()); boost::multi_array kernel = MakeTensor(kernelDesc, std::vector( QuantizedVector(qScale, qOffset, { @@ -278,7 +281,7 @@ LayerTestResult Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest -12,-22, }))); -// Expected output is 1 batch of a 1-channel 6x8 image +// Expected output is 1 batch of a 1-channel 6x8 image. // Manually calculated like this: //[-11*0 -21*0 -12*0 -22*0 ; -11*0 -21*0 -12*0 -22*0 ; -11*0 -21*0 -12*0 -22*0 ; -11*0 -21*0 -12*0 -22*0 ..] //[-11*0 -21*0 -12*0 -22*11 ; -11*0 -21*0 -12*11 -22*21 ; -11*0 -21*0 -12*21 -22*31 ; -11*0 -21*0 -12*31 -22*0 ..] @@ -307,10 +310,10 @@ LayerTestResult Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest expectedOutput, qScale, qOffset, - 1, // padding left - 2, // padding top - 3, // padding right - 4); // padding bottom + 1, // Padding left. + 2, // Padding top. + 3, // Padding right. + 4); // Padding bottom. } template @@ -318,7 +321,7 @@ LayerTestResult SimpleConvolution2dAsymmetricPaddingTestCommon(armnn::IWor float qScale, int32_t qOffset) { - // Use a single-batch 1-channel 5x5 image as input + // Use a single-batch 1-channel 5x5 image as input. armnn::TensorInfo inputDesc({ 1, 1, 5, 5 }, armnn::GetDataType()); boost::multi_array input = MakeTensor(inputDesc, std::vector( QuantizedVector(qScale, qOffset, { @@ -329,7 +332,7 @@ LayerTestResult SimpleConvolution2dAsymmetricPaddingTestCommon(armnn::IWor 15,25,35,45,55, }))); - // Use 1 batch of a 1-channel 4x4 kernel + // Use 1 batch of a 1-channel 4x4 kernel. armnn::TensorInfo kernelDesc({ 1, 1, 4, 4 }, armnn::GetDataType()); boost::multi_array kernel = MakeTensor(kernelDesc, std::vector( QuantizedVector(qScale, qOffset, { @@ -339,7 +342,7 @@ LayerTestResult SimpleConvolution2dAsymmetricPaddingTestCommon(armnn::IWor -14,-24,-34,-44, }))); - // Expected output is 1 batch of a 1-channel 5x5 image + // Expected output is 1 batch of a 1-channel 5x5 image. armnn::TensorInfo outputDesc({ 1, 1, 5, 5 }, armnn::GetDataType()); std::vector myVec(outputDesc.GetNumElements(), 0); boost::multi_array expectedOutput = MakeTensor(outputDesc, std::vector( @@ -358,10 +361,10 @@ LayerTestResult SimpleConvolution2dAsymmetricPaddingTestCommon(armnn::IWor expectedOutput, qScale, qOffset, - 1, // padding left - 1, // padding top - 2, // padding right - 2); // padding bottom + 1, // Padding left. + 1, // Padding top. + 2, // Padding right. + 2); // Padding bottom. } template @@ -370,7 +373,7 @@ LayerTestResult DepthwiseConvolution2dAsymmetricTestCommon(armnn::IWorkloa int32_t qOffset, bool biasEnabled) { - // Use a single-batch 2-channel 5x5 image as input + // Use a single-batch 2-channel 5x5 image as input. armnn::TensorInfo inputTensorInfo({ 1, 2, 5, 5 }, armnn::GetDataType()); auto input = MakeTensor(inputTensorInfo, std::vector( QuantizedVector(inputTensorInfo.GetQuantizationScale(), inputTensorInfo.GetQuantizationOffset(), { @@ -387,7 +390,7 @@ LayerTestResult DepthwiseConvolution2dAsymmetricTestCommon(armnn::IWorkloa 45, 46, 47, 48, 49 }))); - // Use a depth multiplier of 1 on a 2-channel 4x4 kernel + // Use a depth multiplier of 1 on a 2-channel 4x4 kernel. armnn::TensorInfo kernelTensorInfo({ 1, 2, 4, 4 }, armnn::GetDataType()); auto kernel = MakeTensor(kernelTensorInfo, std::vector( QuantizedVector(kernelTensorInfo.GetQuantizationScale(), kernelTensorInfo.GetQuantizationOffset(), { @@ -402,8 +405,8 @@ LayerTestResult DepthwiseConvolution2dAsymmetricTestCommon(armnn::IWorkloa 4, 3, 2, 1 }))); - // Expected output is 1 batch of a 2-channel 5x5 image - // calculated using the python tensorflow library with strideX=1, strideY=1 + // Expected output is 1 batch of a 2-channel 5x5 image. + // Calculated using the python tensorflow library with strideX=1, strideY=1. armnn::TensorInfo outputTensorInfo({ 1, 2, 5, 5 }, armnn::GetDataType()); boost::multi_array expectedOutput = MakeTensor(outputTensorInfo, std::vector( QuantizedVector(outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(), { @@ -426,10 +429,10 @@ LayerTestResult DepthwiseConvolution2dAsymmetricTestCommon(armnn::IWorkloa expectedOutput, qScale, qOffset, - 1, // padding left - 1, // padding top - 2, // padding right - 2, // padding bottom + 1, // Padding left. + 1, // Padding top. + 2, // Padding right. + 2, // Padding bottom. 1, // strideX 1); // strideY } @@ -569,6 +572,55 @@ LayerTestResult CopyViaSplitterUint8Test(armnn::IWorkloadFactory& wo return CopyViaSplitterTestImpl(workloadFactory, 1.0f, 0); } +LayerTestResult LstmLayerFloat32WithCifgWithPeepholeNoProjectionTest( + armnn::IWorkloadFactory& workloadFactory) +{ + armnn::TensorInfo inputDesc({ 2, 2 }, armnn::GetDataType()); + boost::multi_array input = MakeTensor(inputDesc, std::vector( + { 2., 3., 3., 4. })); + + armnn::TensorInfo outputDesc({ 2, 4 }, armnn::GetDataType()); + boost::multi_array expectedOutput = MakeTensor(outputDesc, std::vector( + {-0.36444446f, -0.00352185f, 0.12886585f, -0.05163646f, + -0.42734814f, -0.00478661f, 0.13455015f, -0.03560682f})); + return LstmLayerWithCifgWithPeepholeNoProjectionTestImpl(workloadFactory, input, expectedOutput); +} + +LayerTestResult LstmLayerFloat32NoCifgWithPeepholeWithProjectionTest( + armnn::IWorkloadFactory& workloadFactory) +{ + armnn::TensorInfo inputDesc({ 2, 5 }, armnn::GetDataType()); + boost::multi_array input = MakeTensor(inputDesc, std::vector( + {0.787926f, 0.151646f, 0.071352f, 0.118426f, 0.458058f, + 0.295743f, 0.544053f, 0.690064f, 0.858138f, 0.497181f})); + + armnn::TensorInfo outputDesc({ 2, 16 }, armnn::GetDataType()); + boost::multi_array expectedOutput = MakeTensor(outputDesc, std::vector( + {-0.00396806f, 0.029352f, -0.00279226f, 0.0159977f, -0.00835576f, + -0.0211779f, 0.0283512f, -0.0114597f, 0.00907307f, -0.0244004f, + -0.0152191f, -0.0259063f, 0.00914318f, 0.00415118f, 0.017147f, + 0.0134203f, -0.013869f, 0.0287268f, -0.00334693f, 0.00733398f, -0.0287926f, + -0.0186926f, 0.0193662f, -0.0115437f, 0.00422612f, -0.0345232f, + 0.00223253f, -0.00957321f, 0.0210624f, 0.013331f, 0.0150954f, + 0.02168f})); + return LstmLayerFloat32NoCifgWithPeepholeWithProjectionTestImpl(workloadFactory, input, expectedOutput); +} + +LayerTestResult LstmLayerFloat32NoCifgNoPeepholeNoProjectionTest(armnn::IWorkloadFactory& workloadFactory) +{ + armnn::TensorInfo inputDesc({2, 2}, armnn::GetDataType()); + boost::multi_array input = MakeTensor(inputDesc, std::vector( + {2., 3., 3., 4.})); + + + armnn::TensorInfo outputDesc({2, 4}, armnn::GetDataType()); + boost::multi_array expectedOutput = MakeTensor(outputDesc, std::vector( + {{-0.02973187f, 0.1229473f, 0.20885126f, -0.15358765f, + -0.0185422f, 0.11281417f, 0.24466537f, -0.1826292f}})); + + return LstmNoCifgNoPeepholeNoProjectionTestImpl(workloadFactory, input, expectedOutput); +} + LayerTestResult MergerTest(armnn::IWorkloadFactory& workloadFactory) { unsigned int outputWidth = 3; @@ -583,7 +635,7 @@ LayerTestResult MergerTest(armnn::IWorkloadFactory& workloadFactory) unsigned int inputHeight2 = 6; unsigned int inputChannels2 = 1; - // Define the tensor descriptors + // Define the tensor descriptors. armnn::TensorInfo outputTensorInfo({ outputChannels, outputHeight, outputWidth }, armnn::DataType::Float32); armnn::TensorInfo inputTensorInfo1({ inputChannels1, inputHeight1, inputWidth1 }, armnn::DataType::Float32); armnn::TensorInfo inputTensorInfo2({ inputChannels2, inputHeight2, inputWidth2 }, armnn::DataType::Float32); @@ -644,10 +696,10 @@ LayerTestResult MergerTest(armnn::IWorkloadFactory& workloadFactory) }) ); - std::vector wOrigin1 = {0, 0, 0}; //extent of the window is defined by size of input[0] + std::vector wOrigin1 = {0, 0, 0}; //Extent of the window is defined by size of input[0]. armnn::MergerQueueDescriptor::ViewOrigin window1(wOrigin1); - std::vector wOrigin2 = {2, 0, 0}; //extent of the window is defined by size of input[1] + std::vector wOrigin2 = {2, 0, 0}; //Extent of the window is defined by size of input[1]. armnn::MergerQueueDescriptor::ViewOrigin window2(wOrigin2); std::unique_ptr outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); @@ -1350,7 +1402,7 @@ armnn::OriginsDescriptor CreateMergerDescriptorForConcatenation( // // Concatenation is only supported for N and C dimensions for NCHW. In case of -// <4 dimensions we need to make sure that the concat dimensions is at least +// <4 dimensions we need to make sure that the concat dimensions are at least // the 3rd slowest iterating one. // @@ -1362,8 +1414,8 @@ bool NeedPermuteForConcat( // same number of dimensions. unsigned int nDimensions = 0; - // determine the number of dimensions as well as sanity check them - // agains test implementation issues + // Determine the number of dimensions as well as sanity check them + // agains test implementation issues. for (auto && tensorInfo : inputTensorInfos) { if (!nDimensions) @@ -1464,7 +1516,7 @@ void PermuteInputsForConcat( { numDims = tensorInfo.GetShape().GetNumDimensions(); Generate3dPermuteVectorForConcat(numDims, concatDim, permutations); - // store the reverese permutation + // Store the reverese permutation. permuteVector = permutations.second; BOOST_ASSERT_MSG(!permuteVector.IsEqual(identity), "Test logic error, we don't need permutation, so we shouldn't arrive here"); @@ -1499,7 +1551,7 @@ void PermuteInputsForConcat( // // This is the pair of PermuteInputsForConcat(...) which permutes back -// the output of the concatenation so we can check against an expected +// the output of the concatenation so we can check it against an expected // output. // template @@ -1553,14 +1605,14 @@ void Concatenate(armnn::IWorkloadFactory& workloadFactory, armnn::MergerQueueDescriptor queueDescriptor; - // save a copy of the parameters which we might need to change + // Saves a copy of the parameters which we might need to change. std::vector inputTensorInfos(inputTensorInfosOrig.begin(), inputTensorInfosOrig.end()); std::vector inputs = inputsOrig; armnn::TensorInfo outputTensorInfo = outputTensorInfoOrig; armnn::PermutationVector permuteVector{0, 1, 2}; - // hold and automatically release memory for the reshaped input data + // Holds and automatically releases memory for the reshaped input data. std::vector> tmpInputDataStorage; const size_t inputCount = inputTensorInfos.size(); @@ -1571,7 +1623,7 @@ void Concatenate(armnn::IWorkloadFactory& workloadFactory, { // // We need to permute the inputs, because concatenation along - // the requested axis is not supported + // the requested axis is not supported. // PermuteInputsForConcat(workloadFactory, inputTensorInfos, @@ -2641,7 +2693,7 @@ LayerTestResult SimpleResizeBilinearTest(armnn::IWorkloadFactory& work // The 'resize bilinear' operation projects the top-left corner of output texels into the input image, // then figures out the interpolants and weights. Note this is different to projecting the centre of the - // output texel - and thus we'll expect the output 1x1 matrix to contain as its single element the value + // output texel - and thus we'll expect the output 1x1 matrix to contain, as its single element, the value // that was at position (0,0) of the input matrix (rather than an average, which we would expect if projecting // the centre). LayerTestResult result(outputTensorInfo); @@ -3367,12 +3419,12 @@ LayerTestResult MergerUint8Test(armnn::IWorkloadFactory& workloadFac unsigned int inputHeight2 = 6; unsigned int inputChannels2 = 1; - // Define the tensor descriptors + // Defines the tensor descriptors. armnn::TensorInfo outputTensorInfo({ outputChannels, outputHeight, outputWidth }, armnn::DataType::QuantisedAsymm8); armnn::TensorInfo inputTensorInfo1({ inputChannels1, inputHeight1, inputWidth1 }, armnn::DataType::QuantisedAsymm8); armnn::TensorInfo inputTensorInfo2({ inputChannels2, inputHeight2, inputWidth2 }, armnn::DataType::QuantisedAsymm8); - // Arbitrary scale and offsets. They don't really matter as the merger operator doesn't dequantize/quantize + // Arbitrary scale and offsets. They don't really matter as the merger operator doesn't dequantize/quantize them. const float scale = 0.13497836f; const int32_t offset = -7; @@ -3439,10 +3491,10 @@ LayerTestResult MergerUint8Test(armnn::IWorkloadFactory& workloadFac }) ); - std::vector wOrigin1 = { 0, 0, 0 }; //extent of the window is defined by size of input[0] + std::vector wOrigin1 = { 0, 0, 0 }; //Extent of the window is defined by size of input[0]. armnn::MergerQueueDescriptor::ViewOrigin window1(wOrigin1); - std::vector wOrigin2 = { 2, 0, 0 }; //extent of the window is defined by size of input[1] + std::vector wOrigin2 = { 2, 0, 0 }; //Extent of the window is defined by size of input[1]. armnn::MergerQueueDescriptor::ViewOrigin window2(wOrigin2); @@ -3513,21 +3565,21 @@ LayerTestResult AdditionUint8Test(armnn::IWorkloadFactory& workloadF outputTensorInfo.SetQuantizationScale(scale); outputTensorInfo.SetQuantizationOffset(offset); - // See dequantized values to the right + // See dequantized values to the right. auto input1 = MakeTensor(inputTensorInfo1, std::vector( { 63, 35, 77, 70, 56, 112, // 420, 224, 518, 469, 371, 763 203, 28, 252, 168, 245, 91 // 1400, 175, 1743, 1155, 1694, 616 })); - // See dequantized values to the right + // See dequantized values to the right. auto input2 = MakeTensor(inputTensorInfo1, std::vector( { 21, 7, 175, 231, 175, 210, // 126, 28, 1204, 1596, 1204, 1449 126, 161, 63, 21, 105, 126 // 861, 1106, 420, 126, 714, 861 })); - // See dequantized values to the right + // See dequantized values to the right. LayerTestResult result(outputTensorInfo); result.outputExpected = MakeTensor(outputTensorInfo, std::vector( { @@ -3633,19 +3685,19 @@ LayerTestResult MultiplicationUint8Test(armnn::IWorkloadFactory& wor unsigned int width = 3; const unsigned int shape[] = { batchSize, channels, height, width }; - // See dequantized values to the right + // See dequantized values to the right. std::vector input0({ 62, 37, 3, 172, 13, 111, // 244, 144, 8, 684, 48, 440, 188, 20, 73, 31, 23, 31 // 748, 76, 288, 120, 88, 120 }); - // See dequantized values to the right + // See dequantized values to the right. std::vector input1({ 126, 240, 252, 183, 121, 247, // 384, 726, 762, 555, 369, 747, 48, 115, 151, 79, 78, 97 // 150, 351, 459, 243, 240, 297 }); - // See dequantized values to the right + // See dequantized values to the right. std::vector output( { 64, 72, 0, 255, 8, 236, // 93696, 104544, 6096(clamped), 379620(clamped), 17712, 328680, @@ -3663,7 +3715,7 @@ LayerTestResult MultiplicationUint8Test(armnn::IWorkloadFactory& wor -2, shape, output, - 1366.255f, // Scale/offset chosen to have output values out of range + 1366.255f, // Scale/offset chosen to have output values out of range. -5); } @@ -3813,7 +3865,7 @@ LayerTestResult SimpleResizeBilinearUint8Test(armnn::IWorkloadFactor // The 'resize bilinear' operation projects the top-left corner of output texels into the input image, // then figures out the interpolants and weights. Note this is different to projecting the centre of the - // output texel - and thus we'll expect the output 1x1 matrix to contain as its single element the value + // output texel - and thus we'll expect the output 1x1 matrix to contain, as its single element, the value // that was at position (0,0) of the input matrix (rather than an average, which we would expect if projecting // the centre). LayerTestResult result(outputTensorInfo); @@ -4314,4 +4366,4 @@ LayerTestResult PermuteFloat32ValueSet2Test(armnn::IWorkloadFactory& w LayerTestResult PermuteFloat32ValueSet3Test(armnn::IWorkloadFactory& workloadFactory) { return PermuteFloat32ValueSet3TestCommon(workloadFactory); -}; +}; \ No newline at end of file diff --git a/src/armnn/backends/test/LayerTests.hpp b/src/armnn/backends/test/LayerTests.hpp index 2d543d61de..48f73e7693 100644 --- a/src/armnn/backends/test/LayerTests.hpp +++ b/src/armnn/backends/test/LayerTests.hpp @@ -6,12 +6,13 @@ #include "armnn/ArmNN.hpp" #include "armnn/Tensor.hpp" +#include "Half.hpp" #include #include #include -// Layer callables +// Layer callables. namespace armnn { @@ -213,20 +214,20 @@ LayerTestResult CompareBoundedReLuTest(armnn::IWorkloadFactory& worklo float upperBound, float lowerBound); -// Tests that the output should be identical to the input when the output dimensions match the input ones +// Tests that the output should be identical to the input when the output dimensions match the input ones. LayerTestResult ResizeBilinearNopTest(armnn::IWorkloadFactory& workloadFactory); -// Tests the behaviour of the resize bilinear operation when rescaling a 2x2 image into a 1x1 image +// Tests the behaviour of the resize bilinear operation when rescaling a 2x2 image into a 1x1 image. LayerTestResult SimpleResizeBilinearTest(armnn::IWorkloadFactory& workloadFactory); -// Tests resize bilinear for minification of a square input matrix (also: input dimensions are a -// multiple of output dimensions) +// Tests the resize bilinear for minification of a square input matrix (also: input dimensions are a +// multiple of output dimensions). LayerTestResult ResizeBilinearSqMinTest(armnn::IWorkloadFactory& workloadFactory); -// Tests resize bilinear for minification (output dimensions smaller than input dimensions) +// Tests the resize bilinear for minification (output dimensions smaller than input dimensions). LayerTestResult ResizeBilinearMinTest(armnn::IWorkloadFactory& workloadFactory); -// Tests resize bilinear for magnification (output dimensions bigger than input dimensions) +// Tests the resize bilinear for magnification (output dimensions bigger than input dimensions). LayerTestResult ResizeBilinearMagTest(armnn::IWorkloadFactory& workloadFactory); LayerTestResult BatchNormTest(armnn::IWorkloadFactory& workloadFactory); @@ -315,3 +316,13 @@ LayerTestResult SimplePermuteUint8Test(armnn::IWorkloadFactory& work LayerTestResult PermuteFloat32ValueSet1Test(armnn::IWorkloadFactory& workloadFactory); LayerTestResult PermuteFloat32ValueSet2Test(armnn::IWorkloadFactory& workloadFactory); LayerTestResult PermuteFloat32ValueSet3Test(armnn::IWorkloadFactory& workloadFactory); + +LayerTestResult LstmLayerFloat32WithCifgWithPeepholeNoProjectionTest + (armnn::IWorkloadFactory& workloadFactory); +LayerTestResult + LstmLayerFloat32NoCifgNoPeepholeNoProjectionTest(armnn::IWorkloadFactory& workloadFactory); +LayerTestResult +LstmLayerFloat32NoCifgWithPeepholeWithProjectionTest(armnn::IWorkloadFactory& workloadFactory); + +LayerTestResult SimpleConvertFp16ToFp32Test(armnn::IWorkloadFactory& workloadFactory); +LayerTestResult SimpleConvertFp32ToFp16Test(armnn::IWorkloadFactory& workloadFactory); diff --git a/src/armnn/backends/test/LstmTestImpl.hpp b/src/armnn/backends/test/LstmTestImpl.hpp new file mode 100644 index 0000000000..7f67b020e2 --- /dev/null +++ b/src/armnn/backends/test/LstmTestImpl.hpp @@ -0,0 +1,1150 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include +#include +#include + +#include "test/TensorHelpers.hpp" +#include "QuantizeHelper.hpp" + +#include "backends/CpuTensorHandle.hpp" +#include +#include "backends/WorkloadFactory.hpp" + +LayerTestResult LstmNoCifgNoPeepholeNoProjectionTestImpl(armnn::IWorkloadFactory& workloadFactory, + const boost::multi_array& input, + const boost::multi_array& outputExpected) +{ + unsigned int batchSize = boost::numeric_cast(input.shape()[0]); + unsigned int inputSize = boost::numeric_cast(input.shape()[1]); + unsigned int outputSize = boost::numeric_cast(outputExpected.shape()[1]); + // cellSize and outputSize have the same size when there is no projection. + unsigned numUnits = outputSize; + + + armnn::TensorInfo inputTensorInfo({batchSize , inputSize}, armnn::GetDataType()); + armnn::TensorInfo cellStateInTensorInfo({batchSize , numUnits}, armnn::GetDataType()); + armnn::TensorInfo outputStateInTensorInfo({batchSize , outputSize}, armnn::GetDataType()); + + + armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 3}, armnn::GetDataType()); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, numUnits}, armnn::GetDataType()); + armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, armnn::GetDataType()); + armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, armnn::GetDataType()); + + + LayerTestResult ret(outputTensorInfo); + + std::vector inputVector; + inputVector.assign(input.data(), input.data() + (batchSize * inputSize)); + auto inputTensor = MakeTensor(inputTensorInfo, inputVector); + + std::vector cellStateInVector(batchSize * numUnits, 0.f); + auto cellStateInTensor = MakeTensor(cellStateInTensorInfo, cellStateInVector); + + std::vector outputStateInVector(batchSize * outputSize, 0.f); + auto outputStateInTensor = MakeTensor(outputStateInTensorInfo, outputStateInVector); + + std::vector scratchBufferVector(batchSize * numUnits * 3, 0.f); + auto scratchBufferTensor = MakeTensor(scratchBufferTensorInfo, scratchBufferVector); + + std::vector outputStateOutVector(batchSize * outputSize, 0.f); + auto outputStateOutTensor = MakeTensor(outputStateOutTensorInfo, outputStateOutVector); + + std::vector cellStateOutVector(batchSize * numUnits, 0.f); + auto cellStateOutTensor = MakeTensor(cellStateOutTensorInfo, cellStateOutVector); + + std::vector outputVector; + outputVector.assign(outputExpected.data(), outputExpected.data() + (batchSize * outputSize)); + ret.outputExpected = MakeTensor(outputTensorInfo, outputVector); + + std::unique_ptr inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); + std::unique_ptr cellStateInHandle = + workloadFactory.CreateTensorHandle(cellStateInTensorInfo); + std::unique_ptr outputStateInHandle = + workloadFactory.CreateTensorHandle(outputStateInTensorInfo); + + std::unique_ptr scratchHandle = workloadFactory.CreateTensorHandle(scratchBufferTensorInfo); + std::unique_ptr outputStateOutHandle = + workloadFactory.CreateTensorHandle(outputStateOutTensorInfo); + std::unique_ptr cellStateOutHandle = + workloadFactory.CreateTensorHandle(cellStateOutTensorInfo); + std::unique_ptr outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); + + + armnn::LstmQueueDescriptor data; + armnn::WorkloadInfo info; + + AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); + AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get()); + AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get()); + + AddOutputToWorkload(data, info, scratchBufferTensorInfo, scratchHandle.get()); + AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get()); + AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get()); + AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); + + armnn::TensorInfo tensorInfo4({numUnits}, armnn::GetDataType()); + armnn::TensorInfo tensorInfo8({numUnits, 2}, armnn::GetDataType()); + armnn::TensorInfo tensorInfo16({numUnits, 4}, armnn::GetDataType()); + + auto inputToInputWeights = MakeTensor(tensorInfo8, {-0.45018822f, -0.02338299f, -0.0870589f, + -0.34550029f, 0.04266912f, -0.15680569f, + -0.34856534f, 0.43890524f}); + + auto inputToForgetWeights = MakeTensor(tensorInfo8, {0.09701663f, 0.20334584f, -0.50592935f, + -0.31343272f, -0.40032279f, 0.44781327f, + 0.01387155f, -0.35593212f}); + + auto inputToCellWeights = MakeTensor(tensorInfo8, {-0.50013041f, 0.1370284f, 0.11810488f, 0.2013163f, + -0.20583314f, 0.44344562f, 0.22077113f, + -0.29909778f}); + + auto inputToOutputWeights = MakeTensor(tensorInfo8, {-0.25065863f, -0.28290087f, 0.04613829f, + 0.40525138f, 0.44272184f, 0.03897077f, + -0.1556896f, 0.19487578f}); + + auto recurrentToInputWeights = MakeTensor(tensorInfo16, {-0.0063535f, -0.2042388f, 0.31454784f, + -0.35746509f, 0.28902304f, 0.08183324f, + -0.16555229f, 0.02286911f, -0.13566875f, + 0.03034258f, 0.48091322f, -0.12528998f, + 0.24077177f, -0.51332325f, -0.33502164f, + 0.10629296f}); + + auto recurrentToForgetWeights = MakeTensor(tensorInfo16, {-0.48684245f, -0.06655136f, 0.42224967f, + 0.2112639f, 0.27654213f, 0.20864892f, + -0.07646349f, 0.45877004f, 0.00141793f, + -0.14609534f, 0.36447752f, 0.09196436f, + 0.28053468f, 0.01560611f, -0.20127171f, + -0.01140004f}); + + auto recurrentToCellWeights = MakeTensor(tensorInfo16, {-0.3407414f, 0.24443203f, -0.2078532f, + 0.26320225f, 0.05695659f, -0.00123841f, + -0.4744786f, -0.35869038f, -0.06418842f, + -0.13502428f, -0.501764f, 0.22830659f, + -0.46367589f, 0.26016325f, -0.03894562f, + -0.16368064f}); + + auto recurrentToOutputWeights = MakeTensor(tensorInfo16, {0.43385774f, -0.17194885f, 0.2718237f, + 0.09215671f, 0.24107647f, -0.39835793f, + 0.18212086f, 0.01301402f, 0.48572797f, + -0.50656658f, 0.20047462f, -0.20607421f, + -0.51818722f, -0.15390486f, 0.0468148f, + 0.39922136f}); + + auto cellToInputWeights = MakeTensor(tensorInfo4, {0., 0., 0., 0.}); + + auto inputGateBias = MakeTensor(tensorInfo4, {0., 0., 0., 0.}); + + auto forgetGateBias = MakeTensor(tensorInfo4, {1., 1., 1., 1.}); + + auto cellBias = MakeTensor(tensorInfo4, {0., 0., 0., 0.}); + + auto outputGateBias = MakeTensor(tensorInfo4, {0., 0., 0., 0.}); + + armnn::ScopedCpuTensorHandle inputToInputWeightsTensor(tensorInfo8); + armnn::ScopedCpuTensorHandle inputToForgetWeightsTensor(tensorInfo8); + armnn::ScopedCpuTensorHandle inputToCellWeightsTensor(tensorInfo8); + armnn::ScopedCpuTensorHandle inputToOutputWeightsTensor(tensorInfo8); + armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfo16); + armnn::ScopedCpuTensorHandle recurrentToInputWeightsTensor(tensorInfo16); + armnn::ScopedCpuTensorHandle recurrentToCellWeightsTensor(tensorInfo16); + armnn::ScopedCpuTensorHandle recurrentToOutputWeightsTensor(tensorInfo16); + armnn::ScopedCpuTensorHandle cellToInputWeightsTensor(tensorInfo4); + armnn::ScopedCpuTensorHandle inputGateBiasTensor(tensorInfo4); + armnn::ScopedCpuTensorHandle forgetGateBiasTensor(tensorInfo4); + armnn::ScopedCpuTensorHandle cellBiasTensor(tensorInfo4); + armnn::ScopedCpuTensorHandle outputGateBiasTensor(tensorInfo4); + + AllocateAndCopyDataToITensorHandle(&inputToInputWeightsTensor, &inputToInputWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&inputToForgetWeightsTensor, &inputToForgetWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&inputToCellWeightsTensor, &inputToCellWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&inputToOutputWeightsTensor, &inputToOutputWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToInputWeightsTensor, &recurrentToInputWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToForgetWeightsTensor, &recurrentToForgetWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToCellWeightsTensor, &recurrentToCellWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToOutputWeightsTensor, &recurrentToOutputWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&cellToInputWeightsTensor, &cellToInputWeights[0]); + AllocateAndCopyDataToITensorHandle(&inputGateBiasTensor, &inputGateBias[0]); + AllocateAndCopyDataToITensorHandle(&forgetGateBiasTensor, &forgetGateBias[0]); + AllocateAndCopyDataToITensorHandle(&cellBiasTensor, &cellBias[0]); + AllocateAndCopyDataToITensorHandle(&outputGateBiasTensor, &outputGateBias[0]); + + data.m_InputToInputWeights = &inputToInputWeightsTensor; + data.m_InputToForgetWeights = &inputToForgetWeightsTensor; + data.m_InputToCellWeights = &inputToCellWeightsTensor; + data.m_InputToOutputWeights = &inputToOutputWeightsTensor; + data.m_RecurrentToInputWeights = &recurrentToInputWeightsTensor; + data.m_RecurrentToForgetWeights = &recurrentToForgetWeightsTensor; + data.m_RecurrentToCellWeights = &recurrentToCellWeightsTensor; + data.m_RecurrentToOutputWeights = &recurrentToOutputWeightsTensor; + data.m_CellToInputWeights = &cellToInputWeightsTensor; + data.m_InputGateBias = &inputGateBiasTensor; + data.m_ForgetGateBias = &forgetGateBiasTensor; + data.m_CellBias = &cellBiasTensor; + data.m_OutputGateBias = &outputGateBiasTensor; + + + // Flags to set test configuration + data.m_Parameters.m_ActivationFunc = 4; + data.m_Parameters.m_CifgEnabled = false; + data.m_Parameters.m_PeepholeEnabled = false; + data.m_Parameters.m_ProjectionEnabled = false; + + + std::unique_ptr workload = workloadFactory.CreateLstm(data, info); + inputHandle->Allocate(); + outputStateInHandle->Allocate(); + cellStateInHandle->Allocate(); + + scratchHandle->Allocate(); + outputStateOutHandle->Allocate(); + cellStateOutHandle->Allocate(); + outputHandle->Allocate(); + + CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0]); + CopyDataToITensorHandle(outputStateInHandle.get(), &outputStateInTensor[0][0]); + CopyDataToITensorHandle(cellStateInHandle.get(), &cellStateInTensor[0][0]); + + workloadFactory.Finalize(); + workload->Execute(); + + CopyDataFromITensorHandle(&ret.output[0][0], outputHandle.get()); + + return ret; +} + + +LayerTestResult +LstmLayerFloat32NoCifgWithPeepholeWithProjectionTestImpl(armnn::IWorkloadFactory& workloadFactory, + const boost::multi_array& input, + const boost::multi_array& outputExpected) { + + unsigned int batchSize = 2; + unsigned int outputSize = 16; + unsigned int inputSize = 5; + unsigned numUnits = 20; + + armnn::TensorInfo inputTensorInfo({batchSize , inputSize}, armnn::GetDataType()); + armnn::TensorInfo cellStateInTensorInfo({batchSize , numUnits}, armnn::GetDataType()); + armnn::TensorInfo outputStateInTensorInfo({batchSize , outputSize}, armnn::GetDataType()); + + // Scratch buffer size without CIFG [batchSize, numUnits * 3] + armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 3}, armnn::GetDataType()); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, numUnits}, armnn::GetDataType()); + armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, armnn::GetDataType()); + armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, armnn::GetDataType()); + + LayerTestResult ret(outputTensorInfo); + + std::vector inputVector; + inputVector.assign(input.data(), input.data() + (batchSize * inputSize)); + auto inputTensor = MakeTensor(inputTensorInfo, inputVector); + + std::vector cellStateInVector(batchSize * numUnits, 0.f); + auto cellStateInTensor = MakeTensor(cellStateInTensorInfo, cellStateInVector); + + std::vector outputStateInVector(batchSize * outputSize, 0.f); + auto outputStateInTensor = MakeTensor(outputStateInTensorInfo, outputStateInVector); + + std::vector scratchBufferVector(batchSize * numUnits * 3, 0.f); + auto scratchBufferTensor = MakeTensor(scratchBufferTensorInfo, scratchBufferVector); + + std::vector outputStateOutVector(batchSize * outputSize, 0.f); + auto outputStateOutTensor = MakeTensor(outputStateOutTensorInfo, outputStateOutVector); + + std::vector cellStateOutVector(batchSize * numUnits, 0.f); + auto cellStateOutTensor = MakeTensor(cellStateOutTensorInfo, cellStateOutVector); + + std::vector outputVector; + outputVector.assign(outputExpected.data(), outputExpected.data() + (batchSize * outputSize)); + ret.outputExpected = MakeTensor(outputTensorInfo, outputVector); + + std::unique_ptr inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); + std::unique_ptr cellStateInHandle = + workloadFactory.CreateTensorHandle(cellStateInTensorInfo); + std::unique_ptr outputStateInHandle = + workloadFactory.CreateTensorHandle(outputStateInTensorInfo); + + std::unique_ptr scratchHandle = workloadFactory.CreateTensorHandle(scratchBufferTensorInfo); + std::unique_ptr outputStateOutHandle = + workloadFactory.CreateTensorHandle(outputStateOutTensorInfo); + std::unique_ptr cellStateOutHandle = + workloadFactory.CreateTensorHandle(cellStateOutTensorInfo); + std::unique_ptr outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); + + armnn::LstmQueueDescriptor data; + armnn::WorkloadInfo info; + + AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); + AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get()); + AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get()); + + AddOutputToWorkload(data, info, scratchBufferTensorInfo, scratchHandle.get()); + AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get()); + AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get()); + AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); + + armnn::TensorInfo tensorInfo16({outputSize}, armnn::GetDataType()); + armnn::TensorInfo tensorInfo20({numUnits}, armnn::GetDataType()); + armnn::TensorInfo tensorInfo20x5({numUnits, inputSize}, armnn::GetDataType()); + armnn::TensorInfo tensorInfo20x16({numUnits, outputSize}, armnn::GetDataType()); + armnn::TensorInfo tensorInfo16x20({outputSize, numUnits}, armnn::GetDataType()); + + auto inputToInputWeights = + MakeTensor(tensorInfo20x5, {0.021393683f,0.06124551f, 0.046905167f,-0.014657677f,-0.03149463f, + 0.09171803f, 0.14647801f,0.10797193f, -0.0057968358f,0.0019193048f, + -0.2726754f, 0.10154029f, -0.018539885f, 0.080349885f, -0.10262385f, + -0.022599787f,-0.09121155f, -0.008675967f, -0.045206103f,-0.0821282f, + -0.008045952f,0.015478081f, 0.055217247f, 0.038719587f, 0.044153627f, + -0.06453243f,0.05031825f, -0.046935108f, -0.008164439f, 0.014574226f, + -0.1671009f, -0.15519552f, -0.16819797f,-0.13971269f,-0.11953059f, + 0.25005487f, -0.22790983f, 0.009855087f, -0.028140958f, -0.11200698f, + 0.11295408f, -0.0035217577f, 0.054485075f, 0.05184695f, 0.064711206f, + 0.10989193f, 0.11674786f, 0.03490607f, 0.07727357f, 0.11390585f, + -0.1863375f, -0.1034451f, -0.13945189f, -0.049401227f, -0.18767063f, + 0.042483903f, 0.14233552f, 0.13832581f, 0.18350165f, 0.14545603f, + -0.028545704f,0.024939531f,0.050929718f,0.0076203286f,-0.0029723682f, + -0.042484224f, -0.11827596f, -0.09171104f, -0.10808628f,-0.16327988f, + -0.2273378f, -0.0993647f, -0.017155107f,0.0023917493f,0.049272764f, + 0.0038534778f, 0.054764505f, 0.089753784f, 0.06947234f, 0.08014476f, + -0.04544234f, -0.0497073f,-0.07135631f, -0.048929106f,-0.004042012f, + -0.009284026f, 0.018042054f, 0.0036860977f,-0.07427302f, -0.11434604f, + -0.018995456f, 0.031487543f, 0.012834908f,0.019977754f,0.044256654f, + -0.39292613f, -0.18519334f, -0.11651281f,-0.06809892f, 0.011373677f + }); + + auto inputToForgetWeights = + MakeTensor(tensorInfo20x5, {-0.0018401089f, -0.004852237f,0.03698424f, 0.014181704f,0.028273236f, + -0.016726194f, -0.05249759f,-0.10204261f, 0.00861066f,-0.040979505f, + -0.009899187f,0.01923892f,-0.028177269f, -0.08535103f,-0.14585495f, + 0.10662567f,-0.01909731f,-0.017883534f,-0.0047269356f,-0.045103323f, + 0.0030784295f,0.076784775f,0.07463696f, 0.094531395f,0.0814421f, + -0.12257899f, -0.033945758f,-0.031303465f, 0.045630626f,0.06843887f, + -0.13492945f, -0.012480007f,-0.0811829f, -0.07224499f,-0.09628791f, + 0.045100946f,0.0012300825f, 0.013964662f, 0.099372394f,0.02543059f, + 0.06958324f, 0.034257296f, 0.0482646f, 0.06267997f,0.052625068f, + 0.12784666f, 0.07077897f, 0.025725935f, 0.04165009f,0.07241905f, + 0.018668644f, -0.037377294f,-0.06277783f,-0.08833636f,-0.040120605f, + -0.011405586f,-0.007808335f,-0.010301386f,-0.005102167f,0.027717464f, + 0.05483423f, 0.11449111f, 0.11289652f,0.10939839f, 0.13396506f, + -0.08402166f,-0.01901462f, -0.044678304f,-0.07720565f,0.014350063f, + -0.11757958f, -0.0652038f, -0.08185733f,-0.076754324f,-0.092614375f, + 0.10405491f, 0.052960336f, 0.035755895f,0.035839386f,-0.012540553f, + 0.036881298f, 0.02913376f, 0.03420159f,0.05448447f,-0.054523353f, + 0.02582715f, 0.02327355f, -0.011857179f,-0.0011980024f,-0.034641717f, + -0.026125094f,-0.17582615f,-0.15923657f,-0.27486774f,-0.0006143371f, + 0.0001771948f, -8.470171e-05f, 0.02651807f,0.045790765f,0.06956496f + }); + + auto inputToCellWeights = + MakeTensor(tensorInfo20x5, {-0.04580283f, -0.09549462f, -0.032418985f, -0.06454633f, + -0.043528453f, 0.043018587f, -0.049152344f, -0.12418144f, + -0.078985475f, -0.07596889f, 0.019484362f, -0.11434962f, + -0.0074034138f, -0.06314844f, -0.092981495f, 0.0062155537f, + -0.025034338f, -0.0028890965f, 0.048929527f, 0.06235075f, + 0.10665918f, -0.032036792f, -0.08505916f, -0.10843358f, + -0.13002433f, -0.036816437f, -0.02130134f, -0.016518239f, + 0.0047691227f, -0.0025825808f, 0.066017866f, 0.029991534f, + -0.10652836f, -0.1037554f, -0.13056071f, -0.03266643f, + -0.033702414f, -0.006473424f, -0.04611692f, 0.014419339f, + -0.025174323f, 0.0396852f, 0.081777506f, 0.06157468f, + 0.10210095f, -0.009658194f, 0.046511717f, 0.03603906f, + 0.0069369148f, 0.015960095f, -0.06507666f, 0.09551598f, + 0.053568836f, 0.06408714f, 0.12835667f, -0.008714329f, + -0.20211966f, -0.12093674f, 0.029450472f, 0.2849013f, + -0.029227901f, 0.1164364f, -0.08560263f, 0.09941786f, + -0.036999565f, -0.028842626f, -0.0033637602f, -0.017012902f, + -0.09720865f, -0.11193351f, -0.029155117f, -0.017936034f, + -0.009768936f, -0.04223324f, -0.036159635f, 0.06505112f, + -0.021742892f, -0.023377212f, -0.07221364f, -0.06430552f, + 0.05453865f, 0.091149814f, 0.06387331f, 0.007518393f, + 0.055960953f, 0.069779344f, 0.046411168f, 0.10509911f, + 0.07463894f, 0.0075130584f, 0.012850982f, 0.04555431f, + 0.056955688f, 0.06555285f, 0.050801456f, -0.009862683f, + 0.00826772f, -0.026555609f, -0.0073611983f, -0.0014897042f + }); + + auto inputToOutputWeights = + MakeTensor(tensorInfo20x5, {-0.0998932f, -0.07201956f, -0.052803773f,-0.15629593f,-0.15001918f, + -0.07650751f,0.02359855f, -0.075155355f, -0.08037709f, -0.15093534f, + 0.029517552f, -0.04751393f, 0.010350531f,-0.02664851f, -0.016839722f, + -0.023121163f, 0.0077019283f, 0.012851257f, -0.05040649f,-0.0129761f, + -0.021737747f,-0.038305793f,-0.06870586f, -0.01481247f,-0.001285394f, + 0.10124236f, 0.083122835f, 0.053313006f,-0.062235646f,-0.075637154f, + -0.027833903f, 0.029774971f, 0.1130802f, 0.09218906f, 0.09506135f, + -0.086665764f,-0.037162706f,-0.038880914f,-0.035832845f,-0.014481564f, + -0.09825003f,-0.12048569f,-0.097665586f,-0.05287633f, -0.0964047f, + -0.11366429f, 0.035777505f, 0.13568819f, 0.052451383f,0.050649304f, + 0.05798951f, -0.021852335f,-0.099848844f,0.014740475f,-0.078897946f, + 0.04974699f, 0.014160473f, 0.06973932f, 0.04964942f, 0.033364646f, + 0.08190124f, 0.025535367f, 0.050893165f, 0.048514254f,0.06945813f, + -0.078907564f,-0.06707616f, -0.11844508f, -0.09986688f,-0.07509403f, + 0.06263226f, 0.14925587f, 0.20188436f, 0.12098451f,0.14639415f, + 0.0015017595f, -0.014267382f, -0.03417257f,0.012711468f,0.0028300495f, + -0.024758482f, -0.05098548f,-0.0821182f, 0.014225672f, 0.021544158f, + 0.08949725f, 0.07505268f, -0.0020780868f, 0.04908258f,0.06476295f, + -0.022907063f,0.027562456f,0.040185735f, 0.019567577f,-0.015598739f, + -0.049097303f, -0.017121866f, -0.083368234f,-0.02332002f,-0.0840956f + }); + + auto inputGateBias = + MakeTensor(tensorInfo20, {0.02234832f, 0.14757581f, 0.18176508f, 0.10380666f, 0.053110216f, + -0.06928846f, -0.13942584f, -0.11816189f, 0.19483899f, 0.03652339f, + -0.10250295f, 0.036714908f, -0.18426876f, 0.036065217f, 0.21810818f, + 0.02383196f, -0.043370757f, 0.08690144f, -0.04444982f, 0.00030581196f + }); + + auto forgetGateBias = + MakeTensor(tensorInfo20, {0.035185695f, -0.042891346f, -0.03032477f, 0.23027696f, + 0.11098921f, 0.15378423f, 0.09263801f, 0.09790885f, + 0.09508917f, 0.061199076f, 0.07665568f, -0.015443159f, + -0.03499149f, 0.046190713f, 0.08895977f, 0.10899629f, + 0.40694186f, 0.06030037f, 0.012413437f, -0.06108739f + }); + + auto cellBias = + MakeTensor(tensorInfo20, {-0.024379363f, 0.0055531194f, 0.23377132f, 0.033463873f, + -0.1483596f, -0.10639995f, -0.091433935f, 0.058573797f, + -0.06809782f, -0.07889636f, -0.043246906f, -0.09829136f, + -0.4279842f, 0.034901652f, 0.18797937f, 0.0075234566f, + 0.016178843f, 0.1749513f, 0.13975595f, 0.92058027f + }); + + auto outputGateBias = + MakeTensor(tensorInfo20, {0.046159424f, -0.0012809046f, 0.03563469f, 0.12648113f, 0.027195795f, + 0.35373217f, -0.018957434f, 0.008907322f, -0.0762701f, 0.12018895f, + 0.04216877f, 0.0022856654f, 0.040952638f, 0.3147856f, 0.08225149f, + -0.057416286f, -0.14995944f, -0.008040261f, 0.13208859f, 0.029760877f + }); + + auto recurrentToInputWeights = + MakeTensor(tensorInfo20x16, {-0.001374326f, -0.078856036f, 0.10672688f, 0.029162422f, + -0.11585556f, 0.02557986f, -0.13446963f, -0.035785314f, + -0.01244275f, 0.025961924f, -0.02337298f, -0.044228926f, + -0.055839065f, -0.046598054f, -0.010546039f, -0.06900766f, + 0.027239809f, 0.022582639f, -0.013296484f, -0.05459212f, + 0.08981f, -0.045407712f, 0.08682226f, -0.06867011f, + -0.14390695f, -0.02916037f, 0.000996957f, 0.091420636f, + 0.14283475f, -0.07390571f, -0.06402044f, 0.062524505f, + -0.093129106f, 0.04860203f, -0.08364217f, -0.08119002f, + 0.009352075f, 0.22920375f, 0.0016303885f, 0.11583097f, + -0.13732095f, 0.012405723f, -0.07551853f, 0.06343048f, + 0.12162708f, -0.031923793f, -0.014335606f, 0.01790974f, + -0.10650317f, -0.0724401f, 0.08554849f, -0.05727212f, + 0.06556731f, -0.042729504f, -0.043227166f, 0.011683251f, + -0.013082158f, -0.029302018f, -0.010899579f, -0.062036745f, + -0.022509435f, -0.00964907f, -0.01567329f, 0.04260106f, + -0.07787477f, -0.11576462f, 0.017356863f, 0.048673786f, + -0.017577527f, -0.05527947f, -0.082487635f, -0.040137455f, + -0.10820036f, -0.04666372f, 0.022746278f, -0.07851417f, + 0.01068115f, 0.032956902f, 0.022433773f, 0.0026891115f, + 0.08944216f, -0.0685835f, 0.010513544f, 0.07228705f, + 0.02032331f, -0.059686817f, -0.0005566496f, -0.086984694f, + 0.040414046f, -0.1380399f, 0.094208956f, -0.05722982f, + 0.012092817f, -0.04989123f, -0.086576f, -0.003399834f, + -0.04696032f, -0.045747425f, 0.10091314f, 0.048676282f, + -0.029037097f, 0.031399418f, -0.0040285117f, 0.047237843f, + 0.09504992f, 0.041799378f, -0.049185462f, -0.031518843f, + -0.10516937f, 0.026374253f, 0.10058866f, -0.0033195973f, + -0.041975245f, 0.0073591834f, 0.0033782164f, -0.004325073f, + -0.10167381f, 0.042500053f, -0.01447153f, 0.06464186f, + -0.017142897f, 0.03312627f, 0.009205989f, 0.024138335f, + -0.011337001f, 0.035530265f, -0.010912711f, 0.0706555f, + -0.005894094f, 0.051841937f, -0.1401738f, -0.02351249f, + 0.0365468f, 0.07590991f, 0.08838724f, 0.021681072f, + -0.10086113f, 0.019608743f, -0.06195883f, 0.077335775f, + 0.023646897f, -0.095322326f, 0.02233014f, 0.09756986f, + -0.048691444f, -0.009579111f, 0.07595467f, 0.11480546f, + -0.09801813f, 0.019894179f, 0.08502348f, 0.004032281f, + 0.037211012f, 0.068537936f, -0.048005626f, -0.091520436f, + -0.028379958f, -0.01556313f, 0.06554592f, -0.045599163f, + -0.01672207f, -0.020169014f, -0.011877351f, -0.20212261f, + 0.010889619f, 0.0047078193f, 0.038385306f, 0.08540671f, + -0.017140968f, -0.0035865551f, 0.016678626f, 0.005633034f, + 0.015963363f, 0.00871737f, 0.060130805f, 0.028611384f, + 0.10109069f, -0.015060172f, -0.07894427f, 0.06401885f, + 0.011584063f, -0.024466386f, 0.0047652307f, -0.09041358f, + 0.030737216f, -0.0046374933f, 0.14215417f, -0.11823516f, + 0.019899689f, 0.006106124f, -0.027092824f, 0.0786356f, + 0.05052217f, -0.058925f, -0.011402121f, -0.024987547f, + -0.0013661642f, -0.06832946f, -0.015667673f, -0.1083353f, + -0.00096863037f, -0.06988685f, -0.053350925f, -0.027275559f, + -0.033664223f, -0.07978348f, -0.025200296f, -0.017207067f, + -0.058403496f, -0.055697463f, 0.005798788f, 0.12965427f, + -0.062582195f, 0.0013350133f, -0.10482091f, 0.0379771f, + 0.072521195f, -0.0029455067f, -0.13797039f, -0.03628521f, + 0.013806405f, -0.017858358f, -0.01008298f, -0.07700066f, + -0.017081132f, 0.019358726f, 0.0027079724f, 0.004635139f, + 0.062634714f, -0.02338735f, -0.039547626f, -0.02050681f, + 0.03385117f, -0.083611414f, 0.002862572f, -0.09421313f, + 0.058618143f, -0.08598433f, 0.00972939f, 0.023867095f, + -0.053934585f, -0.023203006f, 0.07452513f, -0.048767887f, + -0.07314807f, -0.056307215f, -0.10433547f, -0.06440842f, + 0.04328182f, 0.04389765f, -0.020006588f, -0.09076438f, + -0.11652589f, -0.021705797f, 0.03345259f, -0.010329105f, + -0.025767034f, 0.013057034f, -0.07316461f, -0.10145612f, + 0.06358255f, 0.18531723f, 0.07759293f, 0.12006465f, + 0.1305557f, 0.058638252f, -0.03393652f, 0.09622831f, + -0.16253184f, -2.4580743e-06f, 0.079869635f, -0.070196845f, + -0.005644518f, 0.06857898f, -0.12598175f, -0.035084512f, + 0.03156317f, -0.12794146f, -0.031963028f, 0.04692781f, + 0.030070418f, 0.0071660685f, -0.095516115f, -0.004643372f, + 0.040170413f, -0.062104587f, -0.0037324072f, 0.0554317f, + 0.08184801f, -0.019164372f, 0.06791302f, 0.034257166f, + -0.10307039f, 0.021943003f, 0.046745934f, 0.0790918f, + -0.0265588f, -0.007824208f, 0.042546265f, -0.00977924f, + -0.0002440307f, -0.017384544f, -0.017990116f, 0.12252321f, + -0.014512694f, -0.08251313f, 0.08861942f, 0.13589665f, + 0.026351685f, 0.012641483f, 0.07466548f, 0.044301085f, + -0.045414884f, -0.051112458f, 0.03444247f, -0.08502782f, + -0.04106223f, -0.028126027f, 0.028473156f, 0.10467447f + }); + + auto recurrentToForgetWeights = + MakeTensor(tensorInfo20x16, {-0.057784554f, -0.026057621f, -0.068447545f, -0.022581743f, + 0.14811787f, 0.10826372f, 0.09471067f, 0.03987225f, + -0.0039523416f, 0.00030638507f, 0.053185795f, 0.10572994f, + 0.08414449f, -0.022036452f, -0.00066928595f, -0.09203576f, + 0.032950465f, -0.10985798f, -0.023809856f, 0.0021431844f, + -0.02196096f, -0.00326074f, 0.00058621005f, -0.074678116f, + -0.06193199f, 0.055729095f, 0.03736828f, 0.020123724f, + 0.061878487f, -0.04729229f, 0.034919553f, -0.07585433f, + -0.04421272f, -0.044019096f, 0.085488975f, 0.04058006f, + -0.06890133f, -0.030951202f, -0.024628663f, -0.07672815f, + 0.034293607f, 0.08556707f, -0.05293577f, -0.033561368f, + -0.04899627f, 0.0241671f, 0.015736353f, -0.095442444f, + -0.029564252f, 0.016493602f, -0.035026584f, 0.022337519f, + -0.026871363f, 0.004780428f, 0.0077918363f, -0.03601621f, + 0.016435321f, -0.03263031f, -0.09543275f, -0.047392778f, + 0.013454138f, 0.028934088f, 0.01685226f, -0.086110644f, + -0.046250615f, -0.01847454f, 0.047608484f, 0.07339695f, + 0.034546845f, -0.04881143f, 0.009128804f, -0.08802852f, + 0.03761666f, 0.008096139f, -0.014454086f, 0.014361001f, + -0.023502491f, -0.0011840804f, -0.07607001f, 0.001856849f, + -0.06509276f, -0.006021153f, -0.08570962f, -0.1451793f, + 0.060212336f, 0.055259194f, 0.06974018f, 0.049454916f, + -0.027794661f, -0.08077226f, -0.016179763f, 0.1169753f, + 0.17213494f, -0.0056326236f, -0.053934924f, -0.0124349f, + -0.11520337f, 0.05409887f, 0.088759385f, 0.0019655675f, + 0.0042065294f, 0.03881498f, 0.019844765f, 0.041858196f, + -0.05695512f, 0.047233116f, 0.038937137f, -0.06542224f, + 0.014429736f, -0.09719407f, 0.13908425f, -0.05379757f, + 0.012321099f, 0.082840554f, -0.029899208f, 0.044217527f, + 0.059855383f, 0.07711018f, -0.045319796f, 0.0948846f, + -0.011724666f, -0.0033288454f, -0.033542685f, -0.04764985f, + -0.13873616f, 0.040668588f, 0.034832682f, -0.015319203f, + -0.018715994f, 0.046002675f, 0.0599172f, -0.043107376f, + 0.0294216f, -0.002314414f, -0.022424703f, 0.0030315618f, + 0.0014641669f, 0.0029166266f, -0.11878115f, 0.013738511f, + 0.12375372f, -0.0006038222f, 0.029104086f, 0.087442465f, + 0.052958444f, 0.07558703f, 0.04817258f, 0.044462286f, + -0.015213451f, -0.08783778f, -0.0561384f, -0.003008196f, + 0.047060397f, -0.002058388f, 0.03429439f, -0.018839769f, + 0.024734668f, 0.024614193f, -0.042046934f, 0.09597743f, + -0.0043254104f, 0.04320769f, 0.0064070094f, -0.0019131786f, + -0.02558259f, -0.022822596f, -0.023273505f, -0.02464396f, + -0.10991725f, -0.006240552f, 0.0074488563f, 0.024044557f, + 0.04383914f, -0.046476185f, 0.028658995f, 0.060410924f, + 0.050786525f, 0.009452605f, -0.0073054377f, -0.024810238f, + 0.0052906186f, 0.0066939713f, -0.0020913032f, 0.014515517f, + 0.015898481f, 0.021362653f, -0.030262267f, 0.016587038f, + -0.011442813f, 0.041154444f, -0.007631438f, -0.03423484f, + -0.010977775f, 0.036152758f, 0.0066366293f, 0.11915515f, + 0.02318443f, -0.041350313f, 0.021485701f, -0.10906167f, + -0.028218046f, -0.00954771f, 0.020531068f, -0.11995105f, + -0.03672871f, 0.024019798f, 0.014255957f, -0.05221243f, + -0.00661567f, -0.04630967f, 0.033188973f, 0.10107534f, + -0.014027541f, 0.030796422f, -0.10270911f, -0.035999842f, + 0.15443139f, 0.07684145f, 0.036571592f, -0.035900835f, + -0.0034699554f, 0.06209149f, 0.015920248f, -0.031122351f, + -0.03858649f, 0.01849943f, 0.13872518f, 0.01503974f, + 0.069941424f, -0.06948533f, -0.0088794185f, 0.061282158f, + -0.047401894f, 0.03100163f, -0.041533746f, -0.10430945f, + 0.044574402f, -0.01425562f, -0.024290353f, 0.034563623f, + 0.05866852f, 0.023947537f, -0.09445152f, 0.035450947f, + 0.02247216f, -0.0042998926f, 0.061146557f, -0.10250651f, + 0.020881841f, -0.06747029f, 0.10062043f, -0.0023941975f, + 0.03532124f, -0.016341697f, 0.09685456f, -0.016764693f, + 0.051808182f, 0.05875331f, -0.04536488f, 0.001626336f, + -0.028892258f, -0.01048663f, -0.009793449f, -0.017093895f, + 0.010987891f, 0.02357273f, -0.00010856845f, 0.0099760275f, + -0.001845119f, -0.03551521f, 0.0018358806f, 0.05763657f, + -0.01769146f, 0.040995963f, 0.02235177f, -0.060430344f, + 0.11475477f, -0.023854522f, 0.10071741f, 0.0686208f, + -0.014250481f, 0.034261297f, 0.047418304f, 0.08562733f, + -0.030519066f, 0.0060542435f, 0.014653856f, -0.038836084f, + 0.04096551f, 0.032249358f, -0.08355519f, -0.026823482f, + 0.056386515f, -0.010401743f, -0.028396193f, 0.08507674f, + 0.014410365f, 0.020995233f, 0.17040324f, 0.11511526f, + 0.02459721f, 0.0066619175f, 0.025853224f, -0.023133837f, + -0.081302024f, 0.017264642f, -0.009585969f, 0.09491168f, + -0.051313367f, 0.054532815f, -0.014298593f, 0.10657464f, + 0.007076659f, 0.10964551f, 0.0409152f, 0.008275321f, + -0.07283536f, 0.07937492f, 0.04192024f, -0.1075027f + }); + + auto recurrentToCellWeights = + MakeTensor(tensorInfo20x16, {-0.037322544f, 0.018592842f, 0.0056175636f, -0.06253426f, + 0.055647098f, -0.05713207f, -0.05626563f, 0.005559383f, + 0.03375411f, -0.025757805f, -0.088049285f, 0.06017052f, + -0.06570978f, 0.007384076f, 0.035123326f, -0.07920549f, + 0.053676967f, 0.044480428f, -0.07663568f, 0.0071805613f, + 0.08089997f, 0.05143358f, 0.038261272f, 0.03339287f, + -0.027673481f, 0.044746667f, 0.028349208f, 0.020090483f, + -0.019443132f, -0.030755889f, -0.0040000007f, 0.04465846f, + -0.021585021f, 0.0031670958f, 0.0053199246f, -0.056117613f, + -0.10893326f, 0.076739706f, -0.08509834f, -0.027997585f, + 0.037871376f, 0.01449768f, -0.09002357f, -0.06111149f, + -0.046195522f, 0.0422062f, -0.005683705f, -0.1253618f, + -0.012925729f, -0.04890792f, 0.06985068f, 0.037654128f, + 0.03398274f, -0.004781977f, 0.007032333f, -0.031787455f, + 0.010868644f, -0.031489216f, 0.09525667f, 0.013939797f, + 0.0058680447f, 0.0167067f, 0.02668468f, -0.04797466f, + -0.048885044f, -0.12722108f, 0.035304096f, 0.06554885f, + 0.00972396f, -0.039238118f, -0.05159735f, -0.11329045f, + 0.1613692f, -0.03750952f, 0.06529313f, -0.071974665f, + -0.11769596f, 0.015524369f, -0.0013754242f, -0.12446318f, + 0.02786344f, -0.014179351f, 0.005264273f, 0.14376344f, + 0.015983658f, 0.03406988f, -0.06939408f, 0.040699873f, + 0.02111075f, 0.09669095f, 0.041345075f, -0.08316494f, + -0.07684199f, -0.045768797f, 0.032298047f, -0.041805092f, + 0.0119405f, 0.0061010392f, 0.12652606f, 0.0064572375f, + -0.024950314f, 0.11574242f, 0.04508852f, -0.04335324f, + 0.06760663f, -0.027437469f, 0.07216407f, 0.06977076f, + -0.05438599f, 0.034033038f, -0.028602652f, 0.05346137f, + 0.043184172f, -0.037189785f, 0.10420091f, 0.00882477f, + -0.054019816f, -0.074273005f, -0.030617684f, -0.0028467078f, + 0.024302477f, -0.0038869337f, 0.005332455f, 0.0013399826f, + 0.04361412f, -0.007001822f, 0.09631092f, -0.06702025f, + -0.042049985f, -0.035070654f, -0.04103342f, -0.10273396f, + 0.0544271f, 0.037184782f, -0.13150354f, -0.0058036847f, + -0.008264958f, 0.042035464f, 0.05891794f, 0.029673764f, + 0.0063542654f, 0.044788733f, 0.054816857f, 0.062257513f, + -0.00093483756f, 0.048938446f, -0.004952862f, -0.007730018f, + -0.04043371f, -0.017094059f, 0.07229206f, -0.023670016f, + -0.052195564f, -0.025616996f, -0.01520939f, 0.045104615f, + -0.007376126f, 0.003533447f, 0.006570588f, 0.056037236f, + 0.12436656f, 0.051817212f, 0.028532185f, -0.08686856f, + 0.11868599f, 0.07663395f, -0.07323171f, 0.03463402f, + -0.050708205f, -0.04458982f, -0.11590894f, 0.021273347f, + 0.1251325f, -0.15313013f, -0.12224372f, 0.17228661f, + 0.023029093f, 0.086124025f, 0.006445803f, -0.03496501f, + 0.028332196f, 0.04449512f, -0.042436164f, -0.026587414f, + -0.006041347f, -0.09292539f, -0.05678812f, 0.03897832f, + 0.09465633f, 0.008115513f, -0.02171956f, 0.08304309f, + 0.071401566f, 0.019622514f, 0.032163795f, -0.004167056f, + 0.02295182f, 0.030739572f, 0.056506045f, 0.004612461f, + 0.06524936f, 0.059999723f, 0.046395954f, -0.0045512207f, + -0.1335546f, -0.030136576f, 0.11584653f, -0.014678886f, + 0.0020118146f, -0.09688814f, -0.0790206f, 0.039770417f, + -0.0329582f, 0.07922767f, 0.029322514f, 0.026405897f, + 0.04207835f, -0.07073373f, 0.063781224f, 0.0859677f, + -0.10925287f, -0.07011058f, 0.048005477f, 0.03438226f, + -0.09606514f, -0.006669445f, -0.043381985f, 0.04240257f, + -0.06955775f, -0.06769346f, 0.043903265f, -0.026784198f, + -0.017840602f, 0.024307009f, -0.040079936f, -0.019946516f, + 0.045318738f, -0.12233574f, 0.026170589f, 0.0074471775f, + 0.15978073f, 0.10185836f, 0.10298046f, -0.015476589f, + -0.039390966f, -0.072174534f, 0.0739445f, -0.1211869f, + -0.0347889f, -0.07943156f, 0.014809798f, -0.12412325f, + -0.0030663363f, 0.039695457f, 0.0647603f, -0.08291318f, + -0.018529687f, -0.004423833f, 0.0037507233f, 0.084633216f, + -0.01514876f, -0.056505352f, -0.012800942f, -0.06994386f, + 0.012962922f, -0.031234352f, 0.07029052f, 0.016418684f, + 0.03618972f, 0.055686004f, -0.08663945f, -0.017404709f, + -0.054761406f, 0.029065743f, 0.052404847f, 0.020238016f, + 0.0048197987f, -0.0214882f, 0.07078733f, 0.013016777f, + 0.06262858f, 0.009184685f, 0.020785125f, -0.043904778f, + -0.0270329f, -0.03299152f, -0.060088247f, -0.015162964f, + -0.001828936f, 0.12642565f, -0.056757294f, 0.013586685f, + 0.09232601f, -0.035886683f, 0.06000002f, 0.05229691f, + -0.052580316f, -0.082029596f, -0.010794592f, 0.012947712f, + -0.036429964f, -0.085508935f, -0.13127148f, -0.017744139f, + 0.031502828f, 0.036232427f, -0.031581745f, 0.023051167f, + -0.05325106f, -0.03421577f, 0.028793324f, -0.034633752f, + -0.009881397f, -0.043551125f, -0.018609839f, 0.0019097115f, + -0.008799762f, 0.056595087f, 0.0022273948f, 0.055752404f + }); + + auto recurrentToOutputWeights = + MakeTensor(tensorInfo20x16, {0.025825322f, -0.05813119f, 0.09495884f,-0.045984812f, -0.01255415f, + -0.0026479573f,-0.08196161f,-0.054914974f,-0.0046604523f, + -0.029587349f, -0.044576716f, -0.07480124f, -0.082868785f, + 0.023254942f, 0.027502948f, -0.0039728214f, -0.08683098f, + -0.08116779f, -0.014675607f, -0.037924774f, -0.023314456f, + -0.007401714f, -0.09255757f, 0.029460307f, -0.08829125f, + -0.005139627f, -0.08989442f, -0.0555066f, 0.13596267f, + -0.025062224f, -0.048351806f, -0.03850004f, 0.07266485f, + -0.022414139f, 0.05940088f, 0.075114764f, 0.09597592f, + -0.010211725f, -0.0049794707f, -0.011523867f, -0.025980417f, + 0.072999895f, 0.11091378f, -0.081685916f, 0.014416728f, + 0.043229222f, 0.034178585f, -0.07530371f, 0.035837382f, + -0.085607f, -0.007721233f, -0.03287832f, -0.043848954f, + -0.06404588f, -0.06632928f, -0.073643476f, 0.008214239f, + -0.045984086f, 0.039764922f, 0.03474462f, 0.060612556f, + -0.080590084f, 0.049127717f, 0.04151091f, -0.030063879f, + 0.008801774f, -0.023021035f, -0.019558564f, 0.05158114f, + -0.010947698f, -0.011825728f, 0.0075720972f, 0.0699727f, + -0.0039981045f, 0.069350146f, 0.08799282f, 0.016156472f, + 0.035502106f, 0.11695009f, 0.006217345f, 0.13392477f, + -0.037875112f, 0.025745004f, 0.08940699f, -0.00924166f, + 0.0046702605f, -0.036598757f, -0.08811812f, 0.10522024f, + -0.032441203f, 0.008176899f, -0.04454919f, 0.07058152f, + 0.0067963637f, 0.039206743f, 0.03259838f, 0.03725492f, + -0.09515802f, 0.013326398f, -0.052055415f, -0.025676316f, + 0.03198509f, -0.015951829f, -0.058556724f, 0.036879618f, + 0.043357447f, 0.028362012f, -0.05908629f, 0.0059240665f, + -0.04995891f, -0.019187413f,0.0276265f, -0.01628143f, 0.0025863599f, + 0.08800015f, 0.035250366f, -0.022165963f, -0.07328642f, + -0.009415526f, -0.07455109f, 0.11690406f, 0.0363299f, + 0.07411125f, 0.042103454f, -0.009660886f, 0.019076364f, + 0.018299393f, -0.046004917f, 0.08891175f,0.0431396f, -0.026327137f, + -0.051502608f, 0.08979574f, -0.051670972f, 0.04940282f, + -0.07491107f, -0.021240504f, 0.022596184f, -0.034280192f, + 0.060163025f, -0.058211457f, -0.051837247f, -0.01349775f, + -0.04639988f, -0.035936575f, -0.011681591f, 0.064818054f, + 0.0073146066f, -0.021745546f, -0.043124277f, -0.06471268f, + -0.07053354f, -0.029321948f, -0.05330136f, 0.016933719f, + -0.053782392f, 0.13747959f, -0.1361751f, -0.11569455f, + 0.0033329215f, 0.05693899f, -0.053219706f, 0.063698f, + 0.07977434f, -0.07924483f, 0.06936997f, 0.0034815092f, + -0.007305279f, -0.037325785f, -0.07251102f, -0.033633437f, + -0.08677009f, 0.091591336f, -0.14165086f, 0.021752775f, + 0.019683983f, 0.0011612234f, -0.058154266f, 0.049996935f, + 0.0288841f, -0.0024567875f, -0.14345716f, 0.010955264f,-0.10234828f, + 0.1183656f, -0.0010731248f, -0.023590032f,-0.072285876f,-0.0724771f, + -0.026382286f, -0.0014920527f, 0.042667855f, 0.0018776858f, + 0.02986552f, 0.009814309f, 0.0733756f, 0.12289186f, + 0.018043943f, -0.0458958f, 0.049412545f, 0.033632483f, + 0.05495232f, 0.036686596f, -0.013781798f, -0.010036754f, + 0.02576849f, -0.08307328f, 0.010112348f, 0.042521734f, + -0.05869831f, -0.071689695f, 0.03876447f, -0.13275425f, -0.0352966f, + -0.023077697f, 0.10285965f, 0.084736146f, 0.15568255f, + -0.00040734606f, 0.027835453f, -0.10292561f, -0.032401145f, + 0.10053256f, -0.026142767f, -0.08271222f, -0.0030240538f, + -0.016368777f, 0.1070414f, 0.042672627f, 0.013456989f, + -0.0437609f, -0.022309763f, 0.11576483f, 0.04108048f, + 0.061026827f, -0.0190714f, -0.0869359f, 0.037901703f, 0.0610107f, + 0.07202949f, 0.01675338f, 0.086139716f, -0.08795751f, + -0.014898893f, -0.023771819f, -0.01965048f, 0.007955471f, + -0.043740474f, 0.03346837f, -0.10549954f, 0.090567775f, + 0.042013682f, -0.03176985f, 0.12569028f, -0.02421228f, + -0.029526481f, 0.023851605f, 0.031539805f, 0.05292009f, + -0.02344001f, -0.07811758f, -0.08834428f, 0.10094801f, + 0.16594367f, -0.06861939f, -0.021256343f, -0.041093912f, + -0.06669611f, 0.035498552f, 0.021757556f, -0.09302526f, + -0.015403468f, -0.06614931f, -0.051798206f, -0.013874718f, + 0.03630673f, 0.010412845f, -0.08077351f, 0.046185967f, + 0.0035662893f, 0.03541868f, -0.094149634f, -0.034814864f, + 0.003128424f, -0.020674974f, -0.03944324f, -0.008110165f, + -0.11113267f, 0.08484226f, 0.043586485f, 0.040582247f, + 0.0968012f, -0.065249965f, -0.028036479f, 0.0050708856f, + 0.0017462453f, 0.0326779f, 0.041296225f, 0.09164146f, + -0.047743853f, -0.015952192f, -0.034451712f, 0.084197424f, + -0.05347844f, -0.11768019f, 0.085926116f, -0.08251791f, + -0.045081906f, 0.0948852f, 0.068401024f, 0.024856757f, + 0.06978981f, -0.057309967f, -0.012775832f, -0.0032452994f, + 0.01977615f, -0.041040014f, -0.024264973f,0.063464895f, 0.05431621f + }); + + auto cellToInputWeights = + MakeTensor(tensorInfo20, {0.040369894f, 0.030746894f, 0.24704495f, 0.018586371f, -0.037586458f, + -0.15312155f, -0.11812848f, -0.11465643f, 0.20259799f, 0.11418174f, + -0.10116027f, -0.011334949f, 0.12411352f, -0.076769054f,-0.052169047f, + 0.21198851f, -0.38871562f, -0.09061183f, -0.09683246f, -0.21929175f + }); + + + auto cellToForgetWeights = + MakeTensor(tensorInfo20, {-0.01998659f,-0.15568835f,-0.24248174f, -0.012770197f, 0.041331276f, + -0.072311886f, -0.052123554f,-0.0066330447f,-0.043891653f,0.036225766f, + -0.047248036f, 0.021479502f,0.033189066f, 0.11952997f, -0.020432774f, + 0.64658105f, -0.06650122f, -0.03467612f, 0.095340036f, 0.23647355f + }); + + auto cellToOutputWeights = + MakeTensor(tensorInfo20, {0.08286371f, -0.08261836f, -0.51210177f, 0.002913762f, 0.17764764f, + -0.5495371f, -0.08460716f, -0.24552552f, 0.030037103f, 0.04123544f, + -0.11940523f, 0.007358328f, 0.1890978f, 0.4833202f, -0.34441817f, + 0.36312827f, -0.26375428f, 0.1457655f, -0.19724406f, 0.15548733f + }); + + auto projectionWeights = + MakeTensor(tensorInfo16x20, + {-0.009802181f, 0.09401916f, 0.0717386f, -0.13895074f, 0.09641832f, + 0.060420845f, 0.08539281f, 0.054285463f, 0.061395317f, 0.034448683f, + -0.042991187f, 0.019801661f, -0.16840284f, -0.015726732f, -0.23041931f, + -0.024478018f, -0.10959692f, -0.013875541f, 0.18600968f, -0.061274476f, + 0.0138165f, -0.08160894f, -0.07661644f, 0.032372914f, 0.16169067f, + 0.22465782f, -0.03993472f, -0.004017731f, 0.08633481f, -0.28869787f, + 0.08682067f, 0.17240396f, 0.014975425f, 0.056431185f, 0.031037588f, + 0.16702051f, 0.0077946745f, 0.15140012f, 0.29405436f, 0.120285f, + -0.188994f, -0.027265169f, 0.043389652f, -0.022061434f, 0.014777949f, + -0.20203483f, 0.094781205f, 0.19100232f, 0.13987629f, -0.036132768f, + -0.06426278f, -0.05108664f, 0.13221376f, 0.009441198f, -0.16715929f, + 0.15859416f, -0.040437475f, 0.050779544f, -0.022187516f, 0.012166504f, + 0.027685808f, -0.07675938f, -0.0055694645f, -0.09444123f, 0.0046453946f, + 0.050794356f, 0.10770313f, -0.20790008f, -0.07149004f, -0.11425117f, + 0.008225835f, -0.035802525f, 0.14374903f, 0.15262283f, 0.048710253f, + 0.1847461f, -0.007487823f, 0.11000021f, -0.09542012f, 0.22619456f, + -0.029149994f, 0.08527916f, 0.009043713f, 0.0042746216f, 0.016261552f, + 0.022461696f, 0.12689082f, -0.043589946f, -0.12035478f, -0.08361797f, + -0.050666027f, -0.1248618f, -0.1275799f, -0.071875185f, 0.07377272f, + 0.09944291f, -0.18897448f, -0.1593054f, -0.06526116f, -0.040107165f, + -0.004618631f, -0.067624845f, -0.007576253f, 0.10727444f, 0.041546922f, + -0.20424393f, 0.06907816f, 0.050412357f, 0.00724631f, 0.039827548f, + 0.12449835f, 0.10747581f, 0.13708383f, 0.09134148f, -0.12617786f, + -0.06428341f, 0.09956831f, 0.1208086f, -0.14676677f, -0.0727722f, + 0.1126304f, 0.010139365f, 0.015571211f, -0.038128063f, 0.022913318f, + -0.042050496f, 0.16842307f, -0.060597885f, 0.10531834f, -0.06411776f, + -0.07451711f, -0.03410368f, -0.13393489f, 0.06534304f, 0.003620307f, + 0.04490757f, 0.05970546f, 0.05197996f, 0.02839995f, 0.10434969f, + -0.013699693f, -0.028353551f, -0.07260381f, 0.047201227f, -0.024575593f, + -0.036445823f, 0.07155557f, 0.009672501f, -0.02328883f, 0.009533515f, + -0.03606021f, -0.07421458f, -0.028082801f, -0.2678904f, -0.13221288f, + 0.18419984f, -0.13012612f, -0.014588381f, -0.035059117f, -0.04824723f, + 0.07830115f, -0.056184657f, 0.03277091f, 0.025466874f, 0.14494097f, + -0.12522776f, -0.098633975f, -0.10766018f, -0.08317623f, 0.08594209f, + 0.07749552f, 0.039474737f, 0.1776665f, -0.07409566f, -0.0477268f, + 0.29323658f, 0.10801441f, 0.1154011f, 0.013952499f, 0.10739139f, + 0.10708251f, -0.051456142f, 0.0074137426f, -0.10430189f, 0.10034707f, + 0.045594677f, 0.0635285f, -0.0715442f, -0.089667566f, -0.10811871f, + 0.00026344223f, 0.08298446f, -0.009525053f, 0.006585689f, -0.24567553f, + -0.09450807f, 0.09648481f, 0.026996298f, -0.06419476f, -0.04752702f, + -0.11063944f, -0.23441927f, -0.17608605f, -0.052156363f, 0.067035615f, + 0.19271925f, -0.0032889997f, -0.043264326f, 0.09663576f, -0.057112187f, + -0.10100678f, 0.0628376f, 0.04447668f, 0.017961001f, -0.10094388f, + -0.10190601f, 0.18335468f, 0.10494553f, -0.052095775f, -0.0026118709f, + 0.10539724f, -0.04383912f, -0.042349473f, 0.08438151f, -0.1947263f, + 0.02251204f, 0.11216432f, -0.10307853f, 0.17351969f, -0.039091777f, + 0.08066188f, -0.00561982f, 0.12633002f, 0.11335965f, -0.0088127935f, + -0.019777594f, 0.06864014f, -0.059751723f, 0.016233567f, -0.06894641f, + -0.28651384f, -0.004228674f, 0.019708522f, -0.16305895f, -0.07468996f, + -0.0855457f, 0.099339016f, -0.07580735f, -0.13775392f, 0.08434318f, + 0.08330512f, -0.12131499f, 0.031935584f, 0.09180414f, -0.08876437f, + -0.08049874f, 0.008753825f, 0.03498998f, 0.030215185f, 0.03907079f, + 0.089751154f, 0.029194152f, -0.03337423f, -0.019092513f, 0.04331237f, + 0.04299654f, -0.036394123f, -0.12915532f, 0.09793732f, 0.07512415f, + -0.11319543f, -0.032502122f, 0.15661901f, 0.07671967f, -0.005491124f, + -0.19379048f, -0.218606f, 0.21448623f, 0.017840758f, 0.1416943f, + -0.07051762f, 0.19488361f, 0.02664691f, -0.18104725f, -0.09334311f, + 0.15026465f, -0.15493552f, -0.057762887f, -0.11604192f, -0.262013f, + -0.01391798f, 0.012185008f, 0.11156489f, -0.07483202f, 0.06693364f, + -0.26151478f, 0.046425626f, 0.036540434f, -0.16435726f, 0.17338543f, + -0.21401681f, -0.11385144f, -0.08283257f, -0.069031075f, 0.030635102f, + 0.010969227f, 0.11109743f, 0.010919218f, 0.027526086f, 0.13519906f, + 0.01891392f, -0.046839405f, -0.040167913f, 0.017953383f, -0.09700955f, + 0.0061885654f, -0.07000971f, 0.026893595f, -0.038844477f, 0.14543656f + }); + + std::vector projectionBiasVector(outputSize, 0.f); + auto projectionBias = MakeTensor(tensorInfo16, projectionBiasVector); + + armnn::ScopedCpuTensorHandle inputToInputWeightsTensor(tensorInfo20x5); + armnn::ScopedCpuTensorHandle inputToForgetWeightsTensor(tensorInfo20x5); + armnn::ScopedCpuTensorHandle inputToCellWeightsTensor(tensorInfo20x5); + armnn::ScopedCpuTensorHandle inputToOutputWeightsTensor(tensorInfo20x5); + armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfo20x16); + armnn::ScopedCpuTensorHandle recurrentToInputWeightsTensor(tensorInfo20x16); + armnn::ScopedCpuTensorHandle recurrentToCellWeightsTensor(tensorInfo20x16); + armnn::ScopedCpuTensorHandle recurrentToOutputWeightsTensor(tensorInfo20x16); + armnn::ScopedCpuTensorHandle cellToInputWeightsTensor(tensorInfo20); + armnn::ScopedCpuTensorHandle inputGateBiasTensor(tensorInfo20); + armnn::ScopedCpuTensorHandle forgetGateBiasTensor(tensorInfo20); + armnn::ScopedCpuTensorHandle cellBiasTensor(tensorInfo20); + armnn::ScopedCpuTensorHandle outputGateBiasTensor(tensorInfo20); + armnn::ScopedCpuTensorHandle cellToForgetWeightsTensor(tensorInfo20); + armnn::ScopedCpuTensorHandle cellToOutputWeightsTensor(tensorInfo20); + armnn::ScopedCpuTensorHandle projectionWeightsTensor(tensorInfo16x20); + armnn::ScopedCpuTensorHandle projectionBiasTensor(tensorInfo16); + + AllocateAndCopyDataToITensorHandle(&inputToInputWeightsTensor, &inputToInputWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&inputToForgetWeightsTensor, &inputToForgetWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&inputToCellWeightsTensor, &inputToCellWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&inputToOutputWeightsTensor, &inputToOutputWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToInputWeightsTensor, &recurrentToInputWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToForgetWeightsTensor, &recurrentToForgetWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToCellWeightsTensor, &recurrentToCellWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToOutputWeightsTensor, &recurrentToOutputWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&cellToInputWeightsTensor, &cellToInputWeights[0]); + AllocateAndCopyDataToITensorHandle(&inputGateBiasTensor, &inputGateBias[0]); + AllocateAndCopyDataToITensorHandle(&forgetGateBiasTensor, &forgetGateBias[0]); + AllocateAndCopyDataToITensorHandle(&cellBiasTensor, &cellBias[0]); + AllocateAndCopyDataToITensorHandle(&outputGateBiasTensor, &outputGateBias[0]); + AllocateAndCopyDataToITensorHandle(&cellToForgetWeightsTensor, &cellToForgetWeights[0]); + AllocateAndCopyDataToITensorHandle(&cellToOutputWeightsTensor, &cellToOutputWeights[0]); + AllocateAndCopyDataToITensorHandle(&projectionWeightsTensor, &projectionWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&projectionBiasTensor, &projectionBias[0]); + + data.m_InputToInputWeights = &inputToInputWeightsTensor; + data.m_InputToForgetWeights = &inputToForgetWeightsTensor; + data.m_InputToCellWeights = &inputToCellWeightsTensor; + data.m_InputToOutputWeights = &inputToOutputWeightsTensor; + data.m_RecurrentToInputWeights = &recurrentToInputWeightsTensor; + data.m_RecurrentToForgetWeights = &recurrentToForgetWeightsTensor; + data.m_RecurrentToCellWeights = &recurrentToCellWeightsTensor; + data.m_RecurrentToOutputWeights = &recurrentToOutputWeightsTensor; + data.m_CellToInputWeights = &cellToInputWeightsTensor; + data.m_InputGateBias = &inputGateBiasTensor; + data.m_ForgetGateBias = &forgetGateBiasTensor; + data.m_CellBias = &cellBiasTensor; + data.m_OutputGateBias = &outputGateBiasTensor; + data.m_CellToForgetWeights = &cellToForgetWeightsTensor; + data.m_CellToOutputWeights = &cellToOutputWeightsTensor; + data.m_ProjectionWeights = &projectionWeightsTensor; + data.m_ProjectionBias = &projectionBiasTensor; + + // Flags to set test configuration + data.m_Parameters.m_ActivationFunc = 4; + data.m_Parameters.m_CifgEnabled = false; + data.m_Parameters.m_PeepholeEnabled = true; + data.m_Parameters.m_ProjectionEnabled = true; + + + std::unique_ptr workload = workloadFactory.CreateLstm(data, info); + inputHandle->Allocate(); + outputStateInHandle->Allocate(); + cellStateInHandle->Allocate(); + + scratchHandle->Allocate(); + outputStateOutHandle->Allocate(); + cellStateOutHandle->Allocate(); + outputHandle->Allocate(); + + CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0]); + CopyDataToITensorHandle(outputStateInHandle.get(), &outputStateInTensor[0][0]); + CopyDataToITensorHandle(cellStateInHandle.get(), &cellStateInTensor[0][0]); + + workloadFactory.Finalize(); + workload->Execute(); + + CopyDataFromITensorHandle(&ret.output[0][0], outputHandle.get()); + + return ret; + +} + + +LayerTestResult LstmLayerWithCifgWithPeepholeNoProjectionTestImpl(armnn::IWorkloadFactory& workloadFactory, + const boost::multi_array& input, + const boost::multi_array& outputExpected) +{ + bool cifgEnabled = true; + bool peepholeEnabled = true; + bool projectionEnabled = false; + // These are not the input and the output of Lstm yet + unsigned int batchSize = boost::numeric_cast(input.shape()[0]); + unsigned int inputSize = boost::numeric_cast(input.shape()[1]); + + unsigned int outputSize = boost::numeric_cast(outputExpected.shape()[1]); + + const unsigned int cellSize = outputSize; + + // Decide the shape of all input tensors + armnn::TensorInfo inputTensorInfo({batchSize , inputSize}, armnn::GetDataType()); + armnn::TensorInfo outputStateInTensorInfo({batchSize, outputSize}, armnn::GetDataType()); + armnn::TensorInfo cellStateInTensorInfo({batchSize, cellSize}, armnn::GetDataType()); + + unsigned int scratchBufferSize = cifgEnabled ? cellSize * 4 : cellSize * 3; + armnn::TensorInfo scratchBufferTensorInfo({batchSize, scratchBufferSize}, armnn::GetDataType()); + armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, armnn::GetDataType()); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, cellSize}, armnn::GetDataType()); + armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, armnn::GetDataType()); + + // List of inputs + std::vector inputData; + inputData.assign(input.data(), input.data() + batchSize*inputSize); + auto inputTensor = MakeTensor(inputTensorInfo, inputData); + + std::vector outputStateInVector(batchSize * outputSize, 0.f); + auto outputStateInTensor = MakeTensor(outputStateInTensorInfo, outputStateInVector); + + std::vector cellStateInVector(batchSize * cellSize, 0.f); + auto cellStateInTensor = MakeTensor(cellStateInTensorInfo, cellStateInVector); + + + // Prepare all the weights in the descriptor for LSTM + armnn::LstmQueueDescriptor data; + armnn::TensorInfo tensorInfoInput({cellSize, inputSize}, armnn::GetDataType()); + armnn::TensorInfo tensorInfoOutput({cellSize, outputSize}, armnn::GetDataType()); + armnn::TensorInfo tensorInfoNumUnits({cellSize}, armnn::GetDataType()); + + auto inputToCellWeights = MakeTensor(tensorInfoInput, + {-0.49770179f, -0.27711356f, -0.09624726f, 0.05100781f, + 0.04717243f, 0.48944736f, -0.38535351f, + -0.17212132f}); + auto inputToForgetWeights = MakeTensor(tensorInfoInput, + {-0.55291498f, -0.42866567f, 0.13056988f, + -0.3633365f, -0.22755712f, 0.28253698f, 0.24407166f, + 0.33826375f}); + auto inputToOutputWeights = MakeTensor(tensorInfoInput, + {0.10725588f, -0.02335852f, -0.55932593f, + -0.09426838f, -0.44257352f, 0.54939759f, + 0.01533556f, 0.42751634f}); + auto cellBias = MakeTensor(tensorInfoNumUnits, {0.f, 0.f, 0.f, 0.f}); + auto forgetGateBias = MakeTensor(tensorInfoNumUnits, {1.f, 1.f, 1.f, 1.f}); + auto outputGateBias = MakeTensor(tensorInfoNumUnits, {0.f, 0.f, 0.f, 0.f}); + + auto recurrentToCellWeights = MakeTensor(tensorInfoOutput, + {0.54066205f, -0.32668582f, -0.43562764f, -0.56094903f, 0.42957711f, + 0.01841056f, -0.32764608f, -0.33027974f, -0.10826075f, 0.20675004f, + 0.19069612f, -0.03026325f, -0.54532051f, 0.33003211f, 0.44901288f, + 0.21193194f}); + auto recurrentToForgetWeights = MakeTensor(tensorInfoOutput, + {-0.13832897f, -0.0515101f, -0.2359007f, -0.16661474f, -0.14340827f, + 0.36986142f, 0.23414481f, 0.55899f, 0.10798943f, -0.41174671f, 0.17751795f, + -0.34484994f, -0.35874045f, -0.11352962f, 0.27268326f, 0.54058349f}); + + auto recurrentToOutputWeights = MakeTensor(tensorInfoOutput, + {0.41613156f, 0.42610586f, -0.16495961f, -0.5663873f, 0.30579174f, -0.05115908f, + -0.33941799f, 0.23364776f, 0.11178309f, 0.09481031f, -0.26424935f, 0.46261835f, + 0.50248802f, 0.26114327f, -0.43736315f, 0.33149987f}); + + auto cellToForgetWeights = MakeTensor(tensorInfoNumUnits, + {0.47485286f, -0.51955009f, -0.24458408f, 0.31544167f}); + auto cellToOutputWeights = MakeTensor(tensorInfoNumUnits, + {-0.17135078f, 0.82760304f, 0.85573703f, -0.77109635f}); + + armnn::ScopedCpuTensorHandle inputToCellWeightsTensor(tensorInfoInput); + armnn::ScopedCpuTensorHandle inputToForgetWeightsTensor(tensorInfoInput); + armnn::ScopedCpuTensorHandle inputToOutputWeightsTensor(tensorInfoInput); + + armnn::ScopedCpuTensorHandle cellBiasTensor(tensorInfoNumUnits); + armnn::ScopedCpuTensorHandle forgetGateBiasTensor(tensorInfoNumUnits); + armnn::ScopedCpuTensorHandle outputGateBiasTensor(tensorInfoNumUnits); + + armnn::ScopedCpuTensorHandle recurrentToCellWeightsTensor(tensorInfoOutput); + armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfoOutput); + armnn::ScopedCpuTensorHandle recurrentToOutputWeightsTensor(tensorInfoOutput); + + + armnn::ScopedCpuTensorHandle cellToForgetWeightsTensor(tensorInfoNumUnits); + armnn::ScopedCpuTensorHandle cellToOutputWeightsTensor(tensorInfoNumUnits); + + AllocateAndCopyDataToITensorHandle(&inputToCellWeightsTensor, &inputToCellWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&inputToForgetWeightsTensor, &inputToForgetWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&inputToOutputWeightsTensor, &inputToOutputWeights[0][0]); + + AllocateAndCopyDataToITensorHandle(&cellBiasTensor, &cellBias[0]); + AllocateAndCopyDataToITensorHandle(&forgetGateBiasTensor, &forgetGateBias[0]); + AllocateAndCopyDataToITensorHandle(&outputGateBiasTensor, &outputGateBias[0]); + + AllocateAndCopyDataToITensorHandle(&recurrentToCellWeightsTensor, &recurrentToCellWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToForgetWeightsTensor, &recurrentToForgetWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToOutputWeightsTensor, &recurrentToOutputWeights[0][0]); + + AllocateAndCopyDataToITensorHandle(&cellToForgetWeightsTensor, &cellToForgetWeights[0]); + AllocateAndCopyDataToITensorHandle(&cellToOutputWeightsTensor, &cellToOutputWeights[0]); + + + data.m_InputToCellWeights = &inputToCellWeightsTensor; + data.m_InputToForgetWeights = &inputToForgetWeightsTensor; + data.m_InputToOutputWeights = &inputToOutputWeightsTensor; + + data.m_CellBias = &cellBiasTensor; + data.m_ForgetGateBias = &forgetGateBiasTensor; + data.m_OutputGateBias = &outputGateBiasTensor; + + data.m_RecurrentToCellWeights = &recurrentToCellWeightsTensor; + data.m_RecurrentToForgetWeights = &recurrentToForgetWeightsTensor; + data.m_RecurrentToOutputWeights = &recurrentToOutputWeightsTensor; + + data.m_CellToForgetWeights = &cellToForgetWeightsTensor; + data.m_CellToOutputWeights = &cellToOutputWeightsTensor; + + // other parameters for the descriptor + data.m_Parameters.m_CifgEnabled = cifgEnabled; + data.m_Parameters.m_ProjectionEnabled = projectionEnabled; + data.m_Parameters.m_PeepholeEnabled = peepholeEnabled; + + data.m_Parameters.m_ActivationFunc = 4; + data.m_Parameters.m_ClippingThresProj = 0.0; + data.m_Parameters.m_ClippingThresCell = 0.0; + + + // List of outputs + std::vector scratchBufferVector(batchSize * scratchBufferSize, 0.f); + auto scratchBufferTensor = MakeTensor(scratchBufferTensorInfo, scratchBufferVector); + LayerTestResult ret0(scratchBufferTensorInfo); + + // Output state for a certain time step + std::vector outputStateOutVector(batchSize * outputSize, 0.f); + auto outputStateOutTensor = MakeTensor(outputStateOutTensorInfo, outputStateOutVector); + LayerTestResult ret1(outputStateOutTensorInfo); + + // Cell state for a certain time step + std::vector cellStateOutVector(batchSize * cellSize, 0.f); + auto cellStateOutTensor = MakeTensor(cellStateOutTensorInfo, cellStateOutVector); + LayerTestResult ret2(cellStateOutTensorInfo); + + // Output for a certain time step + std::vector outputVector(batchSize * outputSize, 0.f); + auto outputTensor = MakeTensor(outputTensorInfo, outputVector); + std::vector outputData; + outputData.assign(outputExpected.data(), outputExpected.data() + batchSize*outputSize); + LayerTestResult ret3(outputTensorInfo); + ret3.outputExpected = MakeTensor(outputTensorInfo, outputData); + + // Prepare the inputs and outputs for the workload + std::unique_ptr inputHandle = + workloadFactory.CreateTensorHandle(inputTensorInfo); + std::unique_ptr outputStateInHandle = + workloadFactory.CreateTensorHandle(outputStateInTensorInfo); + std::unique_ptr cellStateInHandle = + workloadFactory.CreateTensorHandle(cellStateInTensorInfo); + + std::unique_ptr scratchBufferHandle = + workloadFactory.CreateTensorHandle(scratchBufferTensorInfo); + std::unique_ptr outputStateOutHandle = + workloadFactory.CreateTensorHandle(outputStateOutTensorInfo); + std::unique_ptr cellStateOutHandle = + workloadFactory.CreateTensorHandle(cellStateOutTensorInfo); + std::unique_ptr outputHandle = + workloadFactory.CreateTensorHandle(outputTensorInfo); + + armnn::WorkloadInfo info; + AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); + AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get()); + AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get()); + + AddOutputToWorkload(data, info, scratchBufferTensorInfo, scratchBufferHandle.get()); + AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get()); + AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get()); + AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); + + std::unique_ptr workload = workloadFactory.CreateLstm(data, info); + + + inputHandle->Allocate(); + outputStateInHandle->Allocate(); + cellStateInHandle->Allocate(); + + scratchBufferHandle->Allocate(); + outputStateOutHandle->Allocate(); + cellStateOutHandle->Allocate(); + outputHandle->Allocate(); + + + CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0]); + CopyDataToITensorHandle(outputStateInHandle.get(), &outputStateInTensor[0][0]); + CopyDataToITensorHandle(cellStateInHandle.get(), &cellStateInTensor[0][0]); + + CopyDataToITensorHandle(scratchBufferHandle.get(), &scratchBufferTensor[0][0]); + CopyDataToITensorHandle(outputStateOutHandle.get(), &outputStateOutTensor[0][0]); + CopyDataToITensorHandle(cellStateOutHandle.get(), &cellStateOutTensor[0][0]); + + workloadFactory.Finalize(); + workload->Execute(); + + CopyDataFromITensorHandle(&ret0.output[0][0], scratchBufferHandle.get()); + CopyDataFromITensorHandle(&ret1.output[0][0], outputStateOutHandle.get()); + CopyDataFromITensorHandle(&ret2.output[0][0], cellStateOutHandle.get()); + CopyDataFromITensorHandle(&ret3.output[0][0], outputHandle.get()); + + return ret3; +} diff --git a/src/armnn/backends/test/MemCopyTests.cpp b/src/armnn/backends/test/MemCopyTests.cpp index 32331789e9..24a951c395 100644 --- a/src/armnn/backends/test/MemCopyTests.cpp +++ b/src/armnn/backends/test/MemCopyTests.cpp @@ -19,6 +19,10 @@ #include "TensorCopyUtils.hpp" #include "WorkloadTestUtils.hpp" +#if ARMCOMPUTECL_ENABLED || ARMCOMPUTENEON_ENABLED +#include "../ArmComputeTensorUtils.hpp" +#endif + BOOST_AUTO_TEST_SUITE(MemCopyTestSuite) void MemCopyTest(armnn::IWorkloadFactory& srcWorkloadFactory, armnn::IWorkloadFactory& dstWorkloadFactory, @@ -81,6 +85,26 @@ void MemCopyTest(bool withSubtensors) MemCopyTest(srcWorkloadFactory, dstWorkloadFactory, withSubtensors); } +#if ARMCOMPUTECL_ENABLED || ARMCOMPUTENEON_ENABLED + +BOOST_AUTO_TEST_CASE(AclTypeConversions) +{ + arm_compute::Strides strides(1,2,3,4); + armnn::TensorShape convertedStrides = armnn::armcomputetensorutils::GetStrides(strides); + BOOST_TEST(convertedStrides[0] == 4); + BOOST_TEST(convertedStrides[1] == 3); + BOOST_TEST(convertedStrides[2] == 2); + BOOST_TEST(convertedStrides[3] == 1); + + arm_compute::TensorShape shape(5,6,7,8); + armnn::TensorShape convertedshape = armnn::armcomputetensorutils::GetShape(shape); + BOOST_TEST(convertedshape[0] == 8); + BOOST_TEST(convertedshape[1] == 7); + BOOST_TEST(convertedshape[2] == 6); + BOOST_TEST(convertedshape[3] == 5); +} +#endif + #if ARMCOMPUTECL_ENABLED BOOST_AUTO_TEST_CASE(CopyBetweenCpuAndGpu) diff --git a/src/armnn/backends/test/NormTestImpl.hpp b/src/armnn/backends/test/NormTestImpl.hpp index d9dc01592a..df8219ddbd 100644 --- a/src/armnn/backends/test/NormTestImpl.hpp +++ b/src/armnn/backends/test/NormTestImpl.hpp @@ -87,7 +87,7 @@ LayerTestResult SimpleNormalizationTestImpl(armnn::IWorkloadFactory& wo // When normalising within channels, the 3x3 kernel covers the entire 2x2 input at every index. // Therefore, all output values should equal the inputs, but divided by: // pow((kappa + (accumulatedScale * alpha)), beta) - // ...where accumulatedScale is the sum of every element squared + // ...where accumulatedScale is the sum of every element squared. float divisor[inputNum]; for(int i = 0; i < boost::numeric_cast(inputNum); i++) { @@ -139,7 +139,7 @@ LayerTestResult SimpleNormalizationTestImpl(armnn::IWorkloadFactory& wo } break; } - case armnn::NormalizationAlgorithmMethod::LocalContrast: // NOTE: intentional fallthrough + case armnn::NormalizationAlgorithmMethod::LocalContrast: // NOTE: intentional fallthrough. default: { throw armnn::UnimplementedException("Unsupported normalisation method type, " diff --git a/src/armnn/backends/test/Pooling2dTestImpl.hpp b/src/armnn/backends/test/Pooling2dTestImpl.hpp index ab9fd6d6fb..e6e0e6721a 100644 --- a/src/armnn/backends/test/Pooling2dTestImpl.hpp +++ b/src/armnn/backends/test/Pooling2dTestImpl.hpp @@ -155,21 +155,21 @@ LayerTestResult SimpleMaxPooling2dSize3x3Stride2x4TestCommon(armnn::IWorkl 3.0f, 5.0f, 4.0f, 0.0f, 1.0f, 5.0f, 9.0f, 7.0f, }); - // Construct input data + // Constructs input data. std::vector inputData; auto negator = [](float f) { return -f; }; - // First image (two channels where the second channel is the negative of the first one) + // First image (two channels where the second channel is the negative of the first one). inputData.insert(inputData.end(), singleChannelData.begin(), singleChannelData.end()); std::transform(singleChannelData.begin(), singleChannelData.end(), std::back_inserter(inputData), negator); - // Second image (same as first image) + // Second image (same as first image). inputData.insert(inputData.end(), singleChannelData.begin(), singleChannelData.end()); std::transform(singleChannelData.begin(), singleChannelData.end(), std::back_inserter(inputData), negator); auto input = MakeTensor(inputTensorInfo, QuantizedVector(qScale, qOffset, inputData)); - // these were calculated manually + // These were calculated manually. auto shape(GetTensorShapeAsArray<4>(outputTensorInfo)); boost::multi_array outputExpected(shape); if (forceNoPadding) @@ -527,13 +527,13 @@ LayerTestResult AsymmetricNonSquarePooling2dTestCommon(armnn::IWorkloadFac descriptor.m_OutputShapeRounding = armnn::OutputShapeRounding::Floor; descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude; - // Construct input data + // Construct input data. auto input = MakeTensor(inputTensorInfo, QuantizedVector(qScale, qOffset, { 1.0f, 3.0f, 4.0f, })); - // these were calculated manually + // These were calculated manually. auto outputExpected = MakeTensor(outputTensorInfo, QuantizedVector(qScale, qOffset, { 0.0f, 3.0f, 0.0f, 3.0f, @@ -686,7 +686,7 @@ LayerTestResult SimpleMaxPooling2dSize2x2Stride2x2TestCommon(armnn::IWorkl 438.0f, 564.0f, 573.0f, 402.0f }; - // Note that left and right edges will be 0.f, due to the 2x2 max pooling only accessing zeros here + // Note that left and right edges will be 0.f, due to the 2x2 max pooling only accessing zeros here. std::vector expectedOutputDataWithPadding = { 0.0f, 510.0f, 780.0f, 654.0f, 0.0f, 0.0f, 438.0f, 618.0f, 402.0f, 0.0f diff --git a/src/armnn/backends/test/QuantizeHelper.hpp b/src/armnn/backends/test/QuantizeHelper.hpp index bfaf9342f0..0a6ceb761d 100644 --- a/src/armnn/backends/test/QuantizeHelper.hpp +++ b/src/armnn/backends/test/QuantizeHelper.hpp @@ -61,7 +61,7 @@ struct IsFloatingPointIterator }; template ::value, int>::type=0 // Make sure valid fp iterator +typename std::enable_if::value, int>::type=0 // Makes sure fp iterator is valid. > std::vector QuantizedVector(float qScale, int32_t qOffset, FloatIt first, FloatIt last) { diff --git a/src/armnn/backends/test/Reference.cpp b/src/armnn/backends/test/Reference.cpp index b60483a4d9..dedeb50e33 100644 --- a/src/armnn/backends/test/Reference.cpp +++ b/src/armnn/backends/test/Reference.cpp @@ -127,25 +127,8 @@ ARMNN_AUTO_TEST_CASE(FullyConnectedLarge, FullyConnectedLargeTest, false) ARMNN_AUTO_TEST_CASE(FullyConnectedLargeTransposed, FullyConnectedLargeTest, true) // Splitter -BOOST_AUTO_TEST_CASE(SimpleSplitter) -{ - armnn::RefWorkloadFactory workloadFactory; - auto testResult = SplitterTest(workloadFactory); - for (unsigned int i = 0; i < testResult.size(); ++i) - { - BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected)); - } -} - -BOOST_AUTO_TEST_CASE(SplitterUint8) -{ - armnn::RefWorkloadFactory workloadFactory; - auto testResult = SplitterUint8Test(workloadFactory); - for (unsigned int i = 0; i < testResult.size(); ++i) - { - BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected)); - } -} +ARMNN_AUTO_TEST_CASE(SimpleSplitter, SplitterTest) +ARMNN_AUTO_TEST_CASE(SimpleSplitterUint8, SplitterUint8Test) ARMNN_AUTO_TEST_CASE(CopyViaSplitter, CopyViaSplitterTest) ARMNN_AUTO_TEST_CASE(CopyViaSplitterUint8, CopyViaSplitterUint8Test) @@ -242,4 +225,9 @@ ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet1, PermuteFloat32ValueSet1Test) ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet2, PermuteFloat32ValueSet2Test) ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet3, PermuteFloat32ValueSet3Test) +// Convert from Float16 to Float32 +ARMNN_AUTO_TEST_CASE(SimpleConvertFp16ToFp32, SimpleConvertFp16ToFp32Test) +// Convert from Float32 to Float16 +ARMNN_AUTO_TEST_CASE(SimpleConvertFp32ToFp16, SimpleConvertFp32ToFp16Test) + BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/backends/test/SoftmaxTestImpl.hpp b/src/armnn/backends/test/SoftmaxTestImpl.hpp index 4c3e0b73dd..9ed7f603a1 100644 --- a/src/armnn/backends/test/SoftmaxTestImpl.hpp +++ b/src/armnn/backends/test/SoftmaxTestImpl.hpp @@ -39,7 +39,7 @@ LayerTestResult SimpleSoftmaxTestImpl(armnn::IWorkloadFactory& workloadFac LayerTestResult ret(outputTensorInfo); - // Each row is independently softmax'd + // Each row is independently softmax'd. auto input = MakeTensor(inputTensorInfo, std::vector( QuantizedVector(qScale, 0, { 0.f, 1.f, 0.f, 0.f, diff --git a/src/armnn/backends/test/SplitterTestImpl.hpp b/src/armnn/backends/test/SplitterTestImpl.hpp index 70b798eafa..48c0730fa7 100644 --- a/src/armnn/backends/test/SplitterTestImpl.hpp +++ b/src/armnn/backends/test/SplitterTestImpl.hpp @@ -27,35 +27,35 @@ std::vector> SplitterTestCommon(armnn::IWorkloadFactory& wo // NOTE: Compute Library imposes a restriction that the x and y dimension (input height and width) // cannot be split. - // For the reasons for this see first comment on https://jira.arm.com/browse/IVGCVSW-1239 + // For the reasons for this, see first comment on https://jira.arm.com/browse/IVGCVSW-1239 // - // this test has therefore been recast to split the channels, then split the resulting subtensor + // This test has therefore been recast to split the channels, then split the resulting subtensor. - // to take channel 0 of original output - // and channel 0 and channel 1 of the split subtensor + // To take channel 0 of original output + // and channel 0 and channel 1 of the split subtensor. unsigned int outputWidth1 = inputWidth; unsigned int outputHeight1 = inputHeight; unsigned int outputChannels1 = 1; - // to take channel 1 and 2 of the original output + // To take channel 1 and 2 of the original output. unsigned int outputWidth2 = inputWidth; unsigned int outputHeight2 = inputHeight; unsigned int outputChannels2 = 2; - // Define the tensor descriptors + // Define the tensor descriptors. armnn::TensorInfo inputTensorInfo({ inputChannels, inputHeight, inputWidth }, armnn::GetDataType()); - // outputs of the original split + // Outputs of the original split. armnn::TensorInfo outputTensorInfo1({ outputChannels1, outputHeight1, outputWidth1 }, armnn::GetDataType()); armnn::TensorInfo outputTensorInfo2({ outputChannels2, outputHeight2, outputWidth2 }, armnn::GetDataType()); - // outputs of the subsequent subtensor split + // Outputs of the subsequent subtensor split. armnn::TensorInfo outputTensorInfo3({ outputChannels1, outputHeight1, outputWidth1 }, armnn::GetDataType()); armnn::TensorInfo outputTensorInfo4({ outputChannels1, outputHeight1, outputWidth1 }, armnn::GetDataType()); // Set quantization parameters if the requested type is a quantized type. - // The quantization doesn't really matter as the splitter operator doesn't dequantize/quantize + // The quantization doesn't really matter as the splitter operator doesn't dequantize/quantize. if(armnn::IsQuantizedType()) { inputTensorInfo.SetQuantizationScale(qScale); @@ -100,7 +100,7 @@ std::vector> SplitterTestCommon(armnn::IWorkloadFactory& wo }) )); - // channel 0 of the original input + // Channel 0 of the original input. ret1.outputExpected = MakeTensor(outputTensorInfo1, std::vector( QuantizedVector(qScale, qOffset, { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, @@ -112,7 +112,7 @@ std::vector> SplitterTestCommon(armnn::IWorkloadFactory& wo }) )); - // channel 1 & 2 of the original input + // Channel 1 & 2 of the original input. ret2.outputExpected = MakeTensor(outputTensorInfo2, std::vector( QuantizedVector(qScale, qOffset, { 31.0f, 32.0f, 33.0f, 34.0f, 35.0f, @@ -131,7 +131,7 @@ std::vector> SplitterTestCommon(armnn::IWorkloadFactory& wo }) )); - // channel 0 of return 2 (i.e. channels 1 and 2 of the original input) + // Channel 0 of return 2 (i.e. channels 1 and 2 of the original input). ret3.outputExpected = MakeTensor(outputTensorInfo3, std::vector( QuantizedVector(qScale, qOffset, { 31.0f, 32.0f, 33.0f, 34.0f, 35.0f, @@ -143,7 +143,7 @@ std::vector> SplitterTestCommon(armnn::IWorkloadFactory& wo }) )); - // channel 1 of return 2 + // Channel 1 of return 2. ret4.outputExpected = MakeTensor(outputTensorInfo4, std::vector( QuantizedVector(qScale, qOffset, { 61.0f, 62.0f, 63.0f, 64.0f, 65.0f, @@ -155,19 +155,19 @@ std::vector> SplitterTestCommon(armnn::IWorkloadFactory& wo }) )); - // NOTE: as a corollary of the no splitting of x and y restriction the x and y values of the view origins + // NOTE: as a corollary of the splitting of x and y restriction the x and y values of the view origins // have to be zero, the co-ordinates are as per the tensor info above channels, height/y, width/x - // note that under the hood the compute engine reverses these i.e. its coordinate system is x, y, channels - std::vector wOrigin1 = {0, 0, 0}; //extent of the window is defined by size of output[0] + // note that under the hood the compute engine reverses these i.e. its coordinate system is x, y, channels. + std::vector wOrigin1 = {0, 0, 0}; //Extent of the window is defined by size of output[0]. armnn::SplitterQueueDescriptor::ViewOrigin window1(wOrigin1); - std::vector wOrigin2 = {1, 0, 0}; //extent of the window is defined by size of output[1] + std::vector wOrigin2 = {1, 0, 0}; //Extent of the window is defined by size of output[1]. armnn::SplitterQueueDescriptor::ViewOrigin window2(wOrigin2); - std::vector wOrigin3 = {0, 0, 0}; //extent of the window is defined by size of output[2] + std::vector wOrigin3 = {0, 0, 0}; //Extent of the window is defined by size of output[2]. armnn::SplitterQueueDescriptor::ViewOrigin window3(wOrigin3); - std::vector wOrigin4 = {1, 0, 0}; //extent of the window is defined by size of output[3] + std::vector wOrigin4 = {1, 0, 0}; //Extent of the window is defined by size of output[3]. armnn::SplitterQueueDescriptor::ViewOrigin window4(wOrigin4); bool subTensorsSupported = workloadFactory.SupportsSubTensors(); @@ -217,7 +217,7 @@ std::vector> SplitterTestCommon(armnn::IWorkloadFactory& wo CopyDataFromITensorHandle(&ret1.output[0][0][0], outputHandle1.get()); CopyDataFromITensorHandle(&ret2.output[0][0][0], outputHandle2.get()); -// // Do the second split +// // Do the second split. armnn::SplitterQueueDescriptor data2; armnn::WorkloadInfo info2; AddInputToWorkload(data2, info2, outputTensorInfo2, outputHandle2.get()); diff --git a/src/armnn/backends/test/TensorCopyUtils.cpp b/src/armnn/backends/test/TensorCopyUtils.cpp index e15c12a76f..82e80a52fe 100644 --- a/src/armnn/backends/test/TensorCopyUtils.cpp +++ b/src/armnn/backends/test/TensorCopyUtils.cpp @@ -6,6 +6,7 @@ #include #include #include +#include #include "TensorCopyUtils.hpp" @@ -47,12 +48,15 @@ void CopyDataToITensorHandle(armnn::ITensorHandle* tensorHandle, const void* mem case arm_compute::DataType::QASYMM8: CopyArmComputeITensorData(static_cast(mem), handle->GetTensor()); break; + case arm_compute::DataType::F16: + CopyArmComputeITensorData(static_cast(mem), handle->GetTensor()); + break; default: { throw armnn::UnimplementedException(); } } - handle->UnMap(); + handle->Unmap(); break; } #endif @@ -108,12 +112,15 @@ void CopyDataFromITensorHandle(void* mem, const armnn::ITensorHandle* tensorHand case arm_compute::DataType::QASYMM8: CopyArmComputeITensorData(handle->GetTensor(), static_cast(mem)); break; + case arm_compute::DataType::F16: + CopyArmComputeITensorData(handle->GetTensor(), static_cast(mem)); + break; default: { throw armnn::UnimplementedException(); } } - const_cast(handle)->UnMap(); + const_cast(handle)->Unmap(); break; } #endif diff --git a/src/armnn/backends/test/WorkloadDataValidation.cpp b/src/armnn/backends/test/WorkloadDataValidation.cpp index c3a9d40116..bc3898b405 100644 --- a/src/armnn/backends/test/WorkloadDataValidation.cpp +++ b/src/armnn/backends/test/WorkloadDataValidation.cpp @@ -22,7 +22,7 @@ BOOST_AUTO_TEST_CASE(QueueDescriptor_Validate_WrongNumOfInputsOutputs) { InputQueueDescriptor invalidData; WorkloadInfo invalidInfo; - //invalid argument exception is expected, because no inputs and no outputs were defined + //Invalid argument exception is expected, because no inputs and no outputs were defined. BOOST_CHECK_THROW(RefWorkloadFactory().CreateInput(invalidData, invalidInfo), armnn::InvalidArgumentException); } @@ -31,7 +31,7 @@ BOOST_AUTO_TEST_CASE(RefPooling2dFloat32Workload_Validate_WrongDimTensor) armnn::TensorInfo inputTensorInfo; armnn::TensorInfo outputTensorInfo; - unsigned int inputShape[] = {2, 3, 4}; // <- invalid - input tensor has to be 4D + unsigned int inputShape[] = {2, 3, 4}; // <- Invalid - input tensor has to be 4D. unsigned int outputShape[] = {2, 3, 4, 5}; outputTensorInfo = armnn::TensorInfo(4, outputShape, armnn::DataType::Float32); @@ -43,7 +43,7 @@ BOOST_AUTO_TEST_CASE(RefPooling2dFloat32Workload_Validate_WrongDimTensor) AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr); AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr); - // invalid argument exception is expected, input tensor has to be 4D + // Invalid argument exception is expected, input tensor has to be 4D. BOOST_CHECK_THROW(RefPooling2dFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); } @@ -55,7 +55,7 @@ BOOST_AUTO_TEST_CASE(SoftmaxQueueDescriptor_Validate_WrongInputHeight) unsigned int inputNum = 2; unsigned int outputChannels = inputChannels; - unsigned int outputHeight = inputHeight + 1; //makes data invalid - Softmax expects height and width to be 1 + unsigned int outputHeight = inputHeight + 1; //Makes data invalid - Softmax expects height and width to be 1. unsigned int outputWidth = inputWidth; unsigned int outputNum = inputNum; @@ -74,7 +74,7 @@ BOOST_AUTO_TEST_CASE(SoftmaxQueueDescriptor_Validate_WrongInputHeight) AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr); AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr); - //invalid argument exception is expected, because height != 1 + //Invalid argument exception is expected, because height != 1. BOOST_CHECK_THROW(RefSoftmaxFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); } @@ -90,7 +90,7 @@ BOOST_AUTO_TEST_CASE(FullyConnectedQueueDescriptor_Validate_RequiredDataMissing) unsigned int outputChannels = 3; unsigned int outputNum = 2; - // Define the tensor descriptors + // Define the tensor descriptors. armnn::TensorInfo inputTensorInfo; armnn::TensorInfo outputTensorInfo; armnn::TensorInfo weightsDesc; @@ -120,8 +120,8 @@ BOOST_AUTO_TEST_CASE(FullyConnectedQueueDescriptor_Validate_RequiredDataMissing) invalidData.m_Parameters.m_TransposeWeightMatrix = false; - //invalid argument exception is expected, because not all required fields have been provided - //in particular inputsData[0], outputsData[0] and weightsData can not be null + //Invalid argument exception is expected, because not all required fields have been provided. + //In particular inputsData[0], outputsData[0] and weightsData can not be null. BOOST_CHECK_THROW(RefFullyConnectedFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); } @@ -135,8 +135,8 @@ BOOST_AUTO_TEST_CASE(NormalizationQueueDescriptor_Validate_WrongInputHeight) constexpr unsigned int outputNum = inputNum; constexpr unsigned int outputChannels = inputChannels; - constexpr unsigned int outputHeight = inputHeight + 1; //makes data invalid - normalization requires - //input and output to have the same dimensions + constexpr unsigned int outputHeight = inputHeight + 1; //Makes data invalid - normalization requires. + //Input and output to have the same dimensions. constexpr unsigned int outputWidth = inputWidth; @@ -169,7 +169,7 @@ BOOST_AUTO_TEST_CASE(NormalizationQueueDescriptor_Validate_WrongInputHeight) invalidData.m_Parameters.m_Beta = beta; invalidData.m_Parameters.m_K = kappa; - //invalid argument exception is expected, because input height != output height + //Invalid argument exception is expected, because input height != output height. BOOST_CHECK_THROW(RefNormalizationFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); } @@ -201,7 +201,7 @@ BOOST_AUTO_TEST_CASE(SplitterQueueDescriptor_Validate_WrongWindow) AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr); AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr); - // invalid since it has only 3 dimensions while the input tensor is 4d + // Invalid, since it has only 3 dimensions while the input tensor is 4d. std::vector wOrigin = {0, 0, 0}; armnn::SplitterQueueDescriptor::ViewOrigin window(wOrigin); invalidData.m_ViewOrigins.push_back(window); @@ -210,7 +210,7 @@ BOOST_AUTO_TEST_CASE(SplitterQueueDescriptor_Validate_WrongWindow) "match input."); BOOST_CHECK_THROW(RefSplitterFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); - // invalid since window extends past the boundary of input tensor + // Invalid, since window extends past the boundary of input tensor. std::vector wOrigin3 = {0, 0, 15, 0}; armnn::SplitterQueueDescriptor::ViewOrigin window3(wOrigin3); invalidData.m_ViewOrigins[0] = window3; @@ -259,7 +259,7 @@ BOOST_AUTO_TEST_CASE(MergerQueueDescriptor_Validate_WrongWindow) AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr); AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr); - // invalid since it has only 3 dimensions while the input tensor is 4d + // Invalid, since it has only 3 dimensions while the input tensor is 4d. std::vector wOrigin = {0, 0, 0}; armnn::MergerQueueDescriptor::ViewOrigin window(wOrigin); invalidData.m_ViewOrigins.push_back(window); @@ -268,7 +268,7 @@ BOOST_AUTO_TEST_CASE(MergerQueueDescriptor_Validate_WrongWindow) "match input."); BOOST_CHECK_THROW(RefMergerFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); - // invalid since window extends past the boundary of output tensor + // Invalid, since window extends past the boundary of output tensor. std::vector wOrigin3 = {0, 0, 15, 0}; armnn::MergerQueueDescriptor::ViewOrigin window3(wOrigin3); invalidData.m_ViewOrigins[0] = window3; @@ -308,17 +308,17 @@ BOOST_AUTO_TEST_CASE(AdditionQueueDescriptor_Validate_InputNumbers) AddInputToWorkload(invalidData, invalidInfo, input1TensorInfo, nullptr); AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr); - // too few inputs + // Too few inputs. BOOST_CHECK_THROW(RefAdditionFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); AddInputToWorkload(invalidData, invalidInfo, input2TensorInfo, nullptr); - // correct + // Correct. BOOST_CHECK_NO_THROW(RefAdditionFloat32Workload(invalidData, invalidInfo)); AddInputToWorkload(invalidData, invalidInfo, input3TensorInfo, nullptr); - // too many inputs + // Too many inputs. BOOST_CHECK_THROW(RefAdditionFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); } @@ -331,7 +331,7 @@ BOOST_AUTO_TEST_CASE(AdditionQueueDescriptor_Validate_InputShapes) unsigned int shape1[] = {1, 1, 2, 1}; unsigned int shape2[] = {1, 1, 3, 2}; - // Incompatible shapes even with broadcasting + // Incompatible shapes even with broadcasting. { input1TensorInfo = armnn::TensorInfo(4, shape1, armnn::DataType::Float32); input2TensorInfo = armnn::TensorInfo(4, shape2, armnn::DataType::Float32); @@ -347,7 +347,7 @@ BOOST_AUTO_TEST_CASE(AdditionQueueDescriptor_Validate_InputShapes) BOOST_CHECK_THROW(RefAdditionFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); } - // Output size not compatible with input sizes + // Output size not compatible with input sizes. { input1TensorInfo = armnn::TensorInfo(4, shape1, armnn::DataType::Float32); input2TensorInfo = armnn::TensorInfo(4, shape1, armnn::DataType::Float32); @@ -360,7 +360,7 @@ BOOST_AUTO_TEST_CASE(AdditionQueueDescriptor_Validate_InputShapes) AddInputToWorkload(invalidData, invalidInfo, input2TensorInfo, nullptr); AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr); - // output differs + // Output differs. BOOST_CHECK_THROW(RefAdditionFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); } } @@ -374,7 +374,7 @@ BOOST_AUTO_TEST_CASE(MultiplicationQueueDescriptor_Validate_InputTensorDimension constexpr unsigned int input0Shape[] = { 2, 2, 4, 4 }; constexpr std::size_t dimensionCount = std::extent::value; - // Check dimension consistency for input tensors + // Checks dimension consistency for input tensors. for (unsigned int dimIndex = 0; dimIndex < dimensionCount; ++dimIndex) { unsigned int input1Shape[dimensionCount]; @@ -399,7 +399,7 @@ BOOST_AUTO_TEST_CASE(MultiplicationQueueDescriptor_Validate_InputTensorDimension BOOST_CHECK_THROW(RefMultiplicationFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); } - // Check dimension consistency for input and output tensors + // Checks dimension consistency for input and output tensors. for (unsigned int dimIndex = 0; dimIndex < dimensionCount; ++dimIndex) { unsigned int outputShape[dimensionCount]; @@ -430,7 +430,7 @@ BOOST_AUTO_TEST_CASE(ReshapeQueueDescriptor_Validate_MismatchingNumElements) armnn::TensorInfo inputTensorInfo; armnn::TensorInfo outputTensorInfo; - // The input and output shapes should have the same number of elements, but these don't + // The input and output shapes should have the same number of elements, but these don't. unsigned int inputShape[] = { 1, 1, 2, 3 }; unsigned int outputShape[] = { 1, 1, 1, 2 }; @@ -443,8 +443,29 @@ BOOST_AUTO_TEST_CASE(ReshapeQueueDescriptor_Validate_MismatchingNumElements) AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr); AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr); - // InvalidArgumentException is expected, because the number of elements don't match + // InvalidArgumentException is expected, because the number of elements don't match. BOOST_CHECK_THROW(RefReshapeFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); } + +BOOST_AUTO_TEST_CASE(LstmQueueDescriptor_Validate) +{ + armnn::TensorInfo inputTensorInfo; + armnn::TensorInfo outputTensorInfo; + + unsigned int inputShape[] = { 1, 2 }; + unsigned int outputShape[] = { 1 }; + + inputTensorInfo = armnn::TensorInfo(2, inputShape, armnn::DataType::Float32); + outputTensorInfo = armnn::TensorInfo(1, outputShape, armnn::DataType::Float32); + + LstmQueueDescriptor invalidData; + WorkloadInfo invalidInfo; + + AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr); + AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr); + + BOOST_CHECK_THROW(invalidData.Validate(invalidInfo), armnn::InvalidArgumentException); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/layers/ActivationLayer.cpp b/src/armnn/layers/ActivationLayer.cpp index 2371eaa97c..ad1e4a9eba 100644 --- a/src/armnn/layers/ActivationLayer.cpp +++ b/src/armnn/layers/ActivationLayer.cpp @@ -30,12 +30,16 @@ ActivationLayer* ActivationLayer::Clone(Graph& graph) const void ActivationLayer::ValidateTensorShapesFromInputs() { - auto& info = GetInputSlot(0).GetConnection()->GetTensorInfo(); + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); ConditionalThrowIfNotEqual( "ActivationLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - info.GetShape()); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/AdditionLayer.cpp b/src/armnn/layers/AdditionLayer.cpp index 85d12eabcb..ab73a918db 100644 --- a/src/armnn/layers/AdditionLayer.cpp +++ b/src/armnn/layers/AdditionLayer.cpp @@ -28,41 +28,51 @@ AdditionLayer* AdditionLayer::Clone(Graph& graph) const return CloneBase(graph, GetName()); } -void AdditionLayer::ValidateTensorShapesFromInputs() +std::vector AdditionLayer::InferOutputShapes(const std::vector& inputShapes) const { - auto& input0 = GetInputSlot(0).GetConnection()->GetTensorInfo(); - auto& input1 = GetInputSlot(1).GetConnection()->GetTensorInfo(); + BOOST_ASSERT(inputShapes.size() == 2); + auto& input0 = inputShapes[0]; + auto& input1 = inputShapes[1]; - // Get the max of the inputs + // Get the max of the inputs. BOOST_ASSERT(input0.GetNumDimensions() == input1.GetNumDimensions()); unsigned int numDims = input0.GetNumDimensions(); std::vector dims(numDims); - // validate inputs are broadcast compatible -#if !NDEBUG for (unsigned int i = 0; i < numDims; i++) { - unsigned int dim0 = input0.GetShape()[i]; - unsigned int dim1 = input1.GetShape()[i]; + unsigned int dim0 = input0[i]; + unsigned int dim1 = input1[i]; + + // Validates inputs are broadcast compatible. +#if !NDEBUG if (dim0 != dim1) { BOOST_ASSERT_MSG(dim0 == 1 || dim1 == 1, "Dimensions should either match or one should be of size 1."); } - } #endif - for (unsigned int i = 0; i < numDims; i++) - { - unsigned int dim0 = input0.GetShape()[i]; - unsigned int dim1 = input1.GetShape()[i]; dims[i] = std::max(dim0, dim1); } - TensorShape outShape(numDims, dims.data()); + return std::vector({ TensorShape(numDims, dims.data()) }); +} + +void AdditionLayer::ValidateTensorShapesFromInputs() +{ + VerifyLayerConnections(2, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ + GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(), + GetInputSlot(1).GetConnection()->GetTensorInfo().GetShape() + }); + + BOOST_ASSERT(inferredShapes.size() == 1); + ConditionalThrowIfNotEqual( "AdditionLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/AdditionLayer.hpp b/src/armnn/layers/AdditionLayer.hpp index c48c027763..37f0b5c259 100644 --- a/src/armnn/layers/AdditionLayer.hpp +++ b/src/armnn/layers/AdditionLayer.hpp @@ -19,6 +19,8 @@ public: void ValidateTensorShapesFromInputs() override; + std::vector InferOutputShapes(const std::vector& inputShapes) const override; + protected: AdditionLayer(const char* name); ~AdditionLayer() = default; diff --git a/src/armnn/layers/BatchNormalizationLayer.cpp b/src/armnn/layers/BatchNormalizationLayer.cpp index ebb8954ea7..0bf81ebec9 100644 --- a/src/armnn/layers/BatchNormalizationLayer.cpp +++ b/src/armnn/layers/BatchNormalizationLayer.cpp @@ -21,12 +21,19 @@ BatchNormalizationLayer::BatchNormalizationLayer(const armnn::BatchNormalization std::unique_ptr BatchNormalizationLayer::CreateWorkload(const Graph& graph, const IWorkloadFactory& factory) const { + // on this level constant data should not be released.. + BOOST_ASSERT_MSG(m_Mean != nullptr, "BatchNormalizationLayer: Mean data should not be null."); + BOOST_ASSERT_MSG(m_Variance != nullptr, "BatchNormalizationLayer: Variance data should not be null."); + BOOST_ASSERT_MSG(m_Beta != nullptr, "BatchNormalizationLayer: Beta data should not be null."); + BOOST_ASSERT_MSG(m_Gamma != nullptr, "BatchNormalizationLayer: Gamma data should not be null."); + BatchNormalizationQueueDescriptor descriptor; descriptor.m_Mean = m_Mean.get(); descriptor.m_Variance = m_Variance.get(); descriptor.m_Beta = m_Beta.get(); descriptor.m_Gamma = m_Gamma.get(); + return factory.CreateBatchNormalization(descriptor, PrepInfoAndDesc(descriptor, graph)); } @@ -44,17 +51,22 @@ BatchNormalizationLayer* BatchNormalizationLayer::Clone(Graph& graph) const void BatchNormalizationLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow(GetInputSlot(0).GetConnection() != nullptr, - "BatchNormalizationLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "BatchNormalizationLayer: TensorInfo must be set on connected OutputSlot."); + VerifyLayerConnections(1, CHECK_LOCATION()); - auto& info = GetInputSlot(0).GetConnection()->GetTensorInfo(); + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); ConditionalThrowIfNotEqual( "BatchNormalizationLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - info.GetShape()); + inferredShapes[0]); + +} + +Layer::ConstantTensors BatchNormalizationLayer::GetConstantTensorsByRef() +{ + return {m_Mean, m_Variance, m_Beta, m_Gamma}; } } // namespace armnn diff --git a/src/armnn/layers/BatchNormalizationLayer.hpp b/src/armnn/layers/BatchNormalizationLayer.hpp index d8082e5e98..9a1b5bccc8 100644 --- a/src/armnn/layers/BatchNormalizationLayer.hpp +++ b/src/armnn/layers/BatchNormalizationLayer.hpp @@ -29,6 +29,8 @@ public: protected: BatchNormalizationLayer(const BatchNormalizationDescriptor& param, const char* name); ~BatchNormalizationLayer() = default; + + ConstantTensors GetConstantTensorsByRef() override; }; } // namespace diff --git a/src/armnn/layers/ConstantLayer.cpp b/src/armnn/layers/ConstantLayer.cpp index 937d38a31d..2abc595605 100644 --- a/src/armnn/layers/ConstantLayer.cpp +++ b/src/armnn/layers/ConstantLayer.cpp @@ -13,9 +13,8 @@ namespace armnn { -ConstantLayer::ConstantLayer(const std::shared_ptr& input, const char* name) +ConstantLayer::ConstantLayer(const char* name) : Layer(0, 1, LayerType::Constant, name) - , m_LayerOutput(input) { } @@ -29,13 +28,22 @@ std::unique_ptr ConstantLayer::CreateWorkload(const Graph& graph, ConstantLayer* ConstantLayer::Clone(Graph& graph) const { - // Cloned layers share the same layer output object - return CloneBase(graph, m_LayerOutput, GetName()); + // Cloned layers share the same layer output object. + auto layer = CloneBase(graph, GetName()); + + layer->m_LayerOutput = m_LayerOutput ? std::make_unique(*m_LayerOutput) : nullptr; + + return std::move(layer); +} + +std::vector ConstantLayer::InferOutputShapes(const std::vector& inputShapes) const +{ + return std::vector({ m_LayerOutput->GetTensorInfo().GetShape() }); } void ConstantLayer::ValidateTensorShapesFromInputs() { - // get the output shape from the value of the constant layer + // Get the output shape from the value of the constant layer. TensorShape const& outShape = m_LayerOutput->GetTensorInfo().GetShape(); ConditionalThrowIfNotEqual( "ConstantLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", diff --git a/src/armnn/layers/ConstantLayer.hpp b/src/armnn/layers/ConstantLayer.hpp index e8e8d2298c..f215832eae 100644 --- a/src/armnn/layers/ConstantLayer.hpp +++ b/src/armnn/layers/ConstantLayer.hpp @@ -21,12 +21,18 @@ public: void ValidateTensorShapesFromInputs() override; + std::vector InferOutputShapes(const std::vector& inputShapes) const override; + + // Free up the constant source data + void ReleaseConstantData() override {}; + + std::unique_ptr m_LayerOutput; protected: - ConstantLayer(const std::shared_ptr& input, const char* name); + ConstantLayer(const char* name); ~ConstantLayer() = default; -private: - std::shared_ptr m_LayerOutput; + ConstantTensors GetConstantTensorsByRef() override { return {m_LayerOutput}; } + }; } // namespace diff --git a/src/armnn/layers/ConvertFp16ToFp32Layer.cpp b/src/armnn/layers/ConvertFp16ToFp32Layer.cpp new file mode 100644 index 0000000000..80d981c267 --- /dev/null +++ b/src/armnn/layers/ConvertFp16ToFp32Layer.cpp @@ -0,0 +1,48 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "ConvertFp16ToFp32Layer.hpp" +#include "LayerCloneBase.hpp" + +#include + +#include +#include + +namespace armnn +{ + +ConvertFp16ToFp32Layer::ConvertFp16ToFp32Layer(const char* name) + : Layer(1, 1, LayerType::ConvertFp16ToFp32, name) +{ +} + +std::unique_ptr ConvertFp16ToFp32Layer::CreateWorkload(const Graph& graph, + const IWorkloadFactory& factory) const +{ + ConvertFp16ToFp32QueueDescriptor descriptor; + return factory.CreateConvertFp16ToFp32(descriptor, PrepInfoAndDesc(descriptor, graph)); +} + +ConvertFp16ToFp32Layer* ConvertFp16ToFp32Layer::Clone(Graph& graph) const +{ + return CloneBase(graph, GetName()); +} + +void ConvertFp16ToFp32Layer::ValidateTensorShapesFromInputs() +{ + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); + + ConditionalThrowIfNotEqual( + "ConvertFp16ToFp32Layer: TensorShape set on OutputSlot[0] does not match the inferred shape.", + GetOutputSlot(0).GetTensorInfo().GetShape(), + inferredShapes[0]); +} + +} // namespace armnn diff --git a/src/armnn/layers/ConvertFp16ToFp32Layer.hpp b/src/armnn/layers/ConvertFp16ToFp32Layer.hpp new file mode 100644 index 0000000000..94f1fb8925 --- /dev/null +++ b/src/armnn/layers/ConvertFp16ToFp32Layer.hpp @@ -0,0 +1,28 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include + +namespace armnn +{ + +class ConvertFp16ToFp32Layer : public Layer +{ +public: + virtual std::unique_ptr CreateWorkload(const Graph& graph, + const IWorkloadFactory& factory) const override; + + ConvertFp16ToFp32Layer* Clone(Graph& graph) const override; + + void ValidateTensorShapesFromInputs() override; + +protected: + ConvertFp16ToFp32Layer(const char* name); + ~ConvertFp16ToFp32Layer() = default; +}; + +} // namespace diff --git a/src/armnn/layers/ConvertFp32ToFp16Layer.cpp b/src/armnn/layers/ConvertFp32ToFp16Layer.cpp new file mode 100644 index 0000000000..70d6b668f8 --- /dev/null +++ b/src/armnn/layers/ConvertFp32ToFp16Layer.cpp @@ -0,0 +1,47 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include "ConvertFp32ToFp16Layer.hpp" + +#include "LayerCloneBase.hpp" + +#include +#include +#include + +namespace armnn +{ + +ConvertFp32ToFp16Layer::ConvertFp32ToFp16Layer(const char* name) + : Layer(1, 1, LayerType::ConvertFp32ToFp16, name) +{ +} + +std::unique_ptr ConvertFp32ToFp16Layer::CreateWorkload(const Graph& graph, + const IWorkloadFactory& factory) const +{ + ConvertFp32ToFp16QueueDescriptor descriptor; + return factory.CreateConvertFp32ToFp16(descriptor, PrepInfoAndDesc(descriptor, graph)); +} + +ConvertFp32ToFp16Layer* ConvertFp32ToFp16Layer::Clone(Graph& graph) const +{ + return CloneBase(graph, GetName()); +} + +void ConvertFp32ToFp16Layer::ValidateTensorShapesFromInputs() +{ + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); + + ConditionalThrowIfNotEqual( + "ConvertFp32ToFp16Layer: TensorShape set on OutputSlot[0] does not match the inferred shape.", + GetOutputSlot(0).GetTensorInfo().GetShape(), + inferredShapes[0]); +} + +} // namespace armnn diff --git a/src/armnn/layers/ConvertFp32ToFp16Layer.hpp b/src/armnn/layers/ConvertFp32ToFp16Layer.hpp new file mode 100644 index 0000000000..5c3883021d --- /dev/null +++ b/src/armnn/layers/ConvertFp32ToFp16Layer.hpp @@ -0,0 +1,27 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include + +namespace armnn +{ + +class ConvertFp32ToFp16Layer : public Layer +{ +public: + virtual std::unique_ptr CreateWorkload(const Graph& graph, + const IWorkloadFactory& factory) const override; + + ConvertFp32ToFp16Layer* Clone(Graph& graph) const override; + + void ValidateTensorShapesFromInputs() override; + +protected: + ConvertFp32ToFp16Layer(const char* name); + ~ConvertFp32ToFp16Layer() = default; +}; + +} // namespace diff --git a/src/armnn/layers/Convolution2dLayer.cpp b/src/armnn/layers/Convolution2dLayer.cpp index 3829f129bb..05c25bf3a0 100644 --- a/src/armnn/layers/Convolution2dLayer.cpp +++ b/src/armnn/layers/Convolution2dLayer.cpp @@ -20,11 +20,15 @@ Convolution2dLayer::Convolution2dLayer(const Convolution2dDescriptor& param, con std::unique_ptr Convolution2dLayer::CreateWorkload(const Graph& graph, const IWorkloadFactory& factory) const { + // on this level constant data should not be released.. + BOOST_ASSERT_MSG(m_Weight != nullptr, "Convolution2dLayer: Weights data should not be null."); + Convolution2dQueueDescriptor descriptor; descriptor.m_Weight = m_Weight.get(); if (m_Param.m_BiasEnabled) { + BOOST_ASSERT_MSG(m_Bias != nullptr, "Convolution2dLayer: Bias data should not be null."); descriptor.m_Bias = m_Bias.get(); } return factory.CreateConvolution2d(descriptor, PrepInfoAndDesc(descriptor, graph)); @@ -33,6 +37,7 @@ std::unique_ptr Convolution2dLayer::CreateWorkload(const Graph& graph Convolution2dLayer* Convolution2dLayer::Clone(Graph& graph) const { auto layer = CloneBase(graph, m_Param, GetName()); + layer->m_Weight = m_Weight ? std::make_unique(*m_Weight) : nullptr; if (layer->m_Param.m_BiasEnabled) @@ -43,17 +48,11 @@ Convolution2dLayer* Convolution2dLayer::Clone(Graph& graph) const return std::move(layer); } -void Convolution2dLayer::ValidateTensorShapesFromInputs() +std::vector Convolution2dLayer::InferOutputShapes(const std::vector& inputShapes) const { - ConditionalThrow(GetInputSlot(0).GetConnection() != nullptr, - "Convolution2dLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "Convolution2dLayer: TensorInfo must be set on connected OutputSlot."); - - - IOutputSlot* input = GetInputSlot(0).GetConnection(); - const TensorShape& inputShape = input->GetTensorInfo().GetShape(); - const TensorShape filterShape = m_Weight->GetTensorInfo().GetShape(); + BOOST_ASSERT(inputShapes.size() == 2); + const TensorShape& inputShape = inputShapes[0]; + const TensorShape filterShape = inputShapes[1]; // If we support multiple batch dimensions in the future, then this assert will need to change. BOOST_ASSERT_MSG(inputShape.GetNumDimensions() == 4, "Convolutions will always have 4D input."); @@ -73,11 +72,31 @@ void Convolution2dLayer::ValidateTensorShapesFromInputs() unsigned int outChannels = filterShape[0]; unsigned int outBatchSize = inBatchSize; - TensorShape shapeOut({outBatchSize, outChannels, outHeight, outWidth}); + return std::vector({ TensorShape({outBatchSize, outChannels, outHeight, outWidth})}); +} + +void Convolution2dLayer::ValidateTensorShapesFromInputs() +{ + VerifyLayerConnections(1, CHECK_LOCATION()); + + // check if we m_Weight data is not nullptr + BOOST_ASSERT_MSG(m_Weight != nullptr, "Convolution2dLayer: Weights data should not be null."); + + auto inferredShapes = InferOutputShapes({ + GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(), + m_Weight->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); + ConditionalThrowIfNotEqual( "Convolution2dLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - shapeOut); + inferredShapes[0]); +} + +Layer::ConstantTensors Convolution2dLayer::GetConstantTensorsByRef() +{ + return {m_Weight, m_Bias}; } } // namespace armnn diff --git a/src/armnn/layers/Convolution2dLayer.hpp b/src/armnn/layers/Convolution2dLayer.hpp index 4d2c6505d3..8659fe540d 100644 --- a/src/armnn/layers/Convolution2dLayer.hpp +++ b/src/armnn/layers/Convolution2dLayer.hpp @@ -24,9 +24,13 @@ public: void ValidateTensorShapesFromInputs() override; + std::vector InferOutputShapes(const std::vector& inputShapes) const override; + protected: Convolution2dLayer(const Convolution2dDescriptor& param, const char* name); ~Convolution2dLayer() = default; + + ConstantTensors GetConstantTensorsByRef() override; }; } // namespace diff --git a/src/armnn/layers/DepthwiseConvolution2dLayer.cpp b/src/armnn/layers/DepthwiseConvolution2dLayer.cpp index 0442de6c60..471bf015a9 100644 --- a/src/armnn/layers/DepthwiseConvolution2dLayer.cpp +++ b/src/armnn/layers/DepthwiseConvolution2dLayer.cpp @@ -22,11 +22,15 @@ DepthwiseConvolution2dLayer::DepthwiseConvolution2dLayer(const DepthwiseConvolut std::unique_ptr DepthwiseConvolution2dLayer::CreateWorkload(const Graph& graph, const IWorkloadFactory& factory) const { + // on this level constant data should not be released.. + BOOST_ASSERT_MSG(m_Weight != nullptr, "DepthwiseConvolution2dLayer: Weights data should not be null."); + DepthwiseConvolution2dQueueDescriptor descriptor; descriptor.m_Weight = m_Weight.get(); if (m_Param.m_BiasEnabled) { + BOOST_ASSERT_MSG(m_Bias != nullptr, "DepthwiseConvolution2dLayer: Bias data should not be null."); descriptor.m_Bias = m_Bias.get(); } return factory.CreateDepthwiseConvolution2d(descriptor, PrepInfoAndDesc(descriptor, graph)); @@ -45,16 +49,12 @@ DepthwiseConvolution2dLayer* DepthwiseConvolution2dLayer::Clone(Graph& graph) co return std::move(layer); } -void DepthwiseConvolution2dLayer::ValidateTensorShapesFromInputs() +std::vector +DepthwiseConvolution2dLayer::InferOutputShapes(const std::vector& inputShapes) const { - ConditionalThrow(GetInputSlot(0).GetConnection() != nullptr, - "DepthwiseConvolution2dLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "DepthwiseConvolution2dLayer: TensorInfo must be set on connected OutputSlot."); - - IOutputSlot* input = GetInputSlot(0).GetConnection(); - const TensorShape& inputShape = input->GetTensorInfo().GetShape(); - const TensorShape filterShape = m_Weight->GetTensorInfo().GetShape(); + BOOST_ASSERT(inputShapes.size() == 2); + const TensorShape& inputShape = inputShapes[0]; + const TensorShape filterShape = inputShapes[1]; BOOST_ASSERT_MSG(inputShape.GetNumDimensions() == 4, "Convolutions will always have 4D input."); @@ -74,12 +74,32 @@ void DepthwiseConvolution2dLayer::ValidateTensorShapesFromInputs() unsigned int outChannels = filterShape[1]*depthMultiplier; unsigned int outBatchSize = inBatchSize; - TensorShape outShape({outBatchSize, outChannels, outHeight, outWidth}); + return std::vector({ TensorShape({outBatchSize, outChannels, outHeight, outWidth})}); +} + +void DepthwiseConvolution2dLayer::ValidateTensorShapesFromInputs() +{ + VerifyLayerConnections(1, CHECK_LOCATION()); + + // on this level constant data should not be released.. + BOOST_ASSERT_MSG(m_Weight != nullptr, "DepthwiseConvolution2dLayer: Weights data should not be null."); + + auto inferredShapes = InferOutputShapes({ + GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(), + m_Weight->GetTensorInfo().GetShape() + }); + + BOOST_ASSERT(inferredShapes.size() == 1); + ConditionalThrowIfNotEqual( - "DepthwiseConvolution2dLayer: " - "TensorShape set on OutputSlot[0] does not match the inferred shape.", + "DepthwiseConvolution2dLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); +} + +Layer::ConstantTensors DepthwiseConvolution2dLayer::GetConstantTensorsByRef() +{ + return {m_Weight, m_Bias}; } } // namespace armnn diff --git a/src/armnn/layers/DepthwiseConvolution2dLayer.hpp b/src/armnn/layers/DepthwiseConvolution2dLayer.hpp index 60691bf73c..e3be152432 100644 --- a/src/armnn/layers/DepthwiseConvolution2dLayer.hpp +++ b/src/armnn/layers/DepthwiseConvolution2dLayer.hpp @@ -24,9 +24,13 @@ public: void ValidateTensorShapesFromInputs() override; + std::vector InferOutputShapes(const std::vector& inputShapes) const override; + protected: DepthwiseConvolution2dLayer(const DepthwiseConvolution2dDescriptor& param, const char* name); ~DepthwiseConvolution2dLayer() = default; + + ConstantTensors GetConstantTensorsByRef() override; }; } // namespace diff --git a/src/armnn/layers/FakeQuantizationLayer.cpp b/src/armnn/layers/FakeQuantizationLayer.cpp index 24b53b2e37..7bda1c1f78 100644 --- a/src/armnn/layers/FakeQuantizationLayer.cpp +++ b/src/armnn/layers/FakeQuantizationLayer.cpp @@ -32,20 +32,16 @@ FakeQuantizationLayer* FakeQuantizationLayer::Clone(Graph& graph) const void FakeQuantizationLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow(GetInputSlot(0).GetConnection() != nullptr, - "FakeQuantizationLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "FakeQuantizationLayer: TensorInfo must be set on connected OutputSlot."); + VerifyLayerConnections(1, CHECK_LOCATION()); + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); - IOutputSlot* input = GetInputSlot(0).GetConnection(); + BOOST_ASSERT(inferredShapes.size() == 1); - // input and output shapes are the same - TensorShape const& outShape = input->GetTensorInfo().GetShape(); ConditionalThrowIfNotEqual( "FakeQuantizationLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/FloorLayer.cpp b/src/armnn/layers/FloorLayer.cpp index a9ddcca60c..e88600b354 100644 --- a/src/armnn/layers/FloorLayer.cpp +++ b/src/armnn/layers/FloorLayer.cpp @@ -32,18 +32,16 @@ FloorLayer* FloorLayer::Clone(Graph& graph) const void FloorLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow(GetInputSlot(0).GetConnection() != nullptr, - "FloorLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "FloorLayer: TensorInfo must be set on connected OutputSlot."); - - // input and output shapes are the same - IOutputSlot* input = GetInputSlot(0).GetConnection(); - TensorShape const& outShape = input->GetTensorInfo().GetShape(); + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); + ConditionalThrowIfNotEqual( "FloorLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/FullyConnectedLayer.cpp b/src/armnn/layers/FullyConnectedLayer.cpp index 1597e8c2c3..8b8f010bdb 100644 --- a/src/armnn/layers/FullyConnectedLayer.cpp +++ b/src/armnn/layers/FullyConnectedLayer.cpp @@ -22,11 +22,15 @@ FullyConnectedLayer::FullyConnectedLayer(const FullyConnectedDescriptor& param, std::unique_ptr FullyConnectedLayer::CreateWorkload(const Graph& graph, const IWorkloadFactory& factory) const { + // on this level constant data should not be released.. + BOOST_ASSERT_MSG(m_Weight != nullptr, "FullyConnectedLayer: Weights data should not be null."); + FullyConnectedQueueDescriptor descriptor; descriptor.m_Weight = m_Weight.get(); if (m_Param.m_BiasEnabled) { + BOOST_ASSERT_MSG(m_Bias != nullptr, "FullyConnectedLayer: Bias data should not be null."); descriptor.m_Bias = m_Bias.get(); } return factory.CreateFullyConnected(descriptor, PrepInfoAndDesc(descriptor, graph)); @@ -45,25 +49,41 @@ FullyConnectedLayer* FullyConnectedLayer::Clone(Graph& graph) const return std::move(layer); } +std::vector FullyConnectedLayer::InferOutputShapes(const std::vector& inputShapes) const +{ + BOOST_ASSERT(inputShapes.size() == 2); + const TensorShape& inputShape = inputShapes[0]; + const TensorShape weightShape = inputShapes[1]; + + // Output for FC is [1, w[1]]. + unsigned int batches = inputShape[0]; + unsigned int dimIdx = m_Param.m_TransposeWeightMatrix ? 0 : 1; + + return std::vector({ TensorShape({batches, weightShape[dimIdx]})}); +} + void FullyConnectedLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow(GetInputSlot(0).GetConnection() != nullptr, - "FullyConnectedLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "FullyConnectedLayer: TensorInfo must be set on connected OutputSlot."); + VerifyLayerConnections(1, CHECK_LOCATION()); + // check if we m_Weight data is not nullptr + BOOST_ASSERT_MSG(m_Weight != nullptr, "FullyConnectedLayer: Weights data should not be null."); - TensorShape const& weightShape = m_Weight->GetTensorInfo().GetShape(); + auto inferredShapes = InferOutputShapes({ + GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(), + m_Weight->GetTensorInfo().GetShape() }); - // output for FC is [1, w[1]] - unsigned int batches = GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape()[0]; - unsigned int dimIdx = m_Param.m_TransposeWeightMatrix ? 0 : 1; - TensorShape outShape({batches, weightShape[dimIdx]}); + BOOST_ASSERT(inferredShapes.size() == 1); ConditionalThrowIfNotEqual( "FullyConnectedLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); +} + +Layer::ConstantTensors FullyConnectedLayer::GetConstantTensorsByRef() +{ + return {m_Weight, m_Bias}; } } // namespace armnn diff --git a/src/armnn/layers/FullyConnectedLayer.hpp b/src/armnn/layers/FullyConnectedLayer.hpp index 1d6cb7cf8d..6300cafd62 100644 --- a/src/armnn/layers/FullyConnectedLayer.hpp +++ b/src/armnn/layers/FullyConnectedLayer.hpp @@ -23,10 +23,13 @@ public: FullyConnectedLayer* Clone(Graph& graph) const override; void ValidateTensorShapesFromInputs() override; + std::vector InferOutputShapes(const std::vector& inputShapes) const override; protected: FullyConnectedLayer(const FullyConnectedDescriptor& param, const char* name); ~FullyConnectedLayer() = default; + + ConstantTensors GetConstantTensorsByRef() override; }; } // namespace diff --git a/src/armnn/layers/L2NormalizationLayer.cpp b/src/armnn/layers/L2NormalizationLayer.cpp index 07020bfdca..7249bc3b5c 100644 --- a/src/armnn/layers/L2NormalizationLayer.cpp +++ b/src/armnn/layers/L2NormalizationLayer.cpp @@ -32,19 +32,16 @@ L2NormalizationLayer* L2NormalizationLayer::Clone(Graph& graph) const void L2NormalizationLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow(GetInputSlot(0).GetConnection() != nullptr, - "L2NormalizationLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "L2NormalizationLayer: TensorInfo must be set on connected OutputSlot."); + VerifyLayerConnections(1, CHECK_LOCATION()); - IOutputSlot* input = GetInputSlot(0).GetConnection(); + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); - // input and output shapes are the same - TensorShape const& outShape = input->GetTensorInfo().GetShape(); ConditionalThrowIfNotEqual( "L2NormalizationLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/LayerWithParameters.hpp b/src/armnn/layers/LayerWithParameters.hpp index e3eb40a273..c071c15c21 100644 --- a/src/armnn/layers/LayerWithParameters.hpp +++ b/src/armnn/layers/LayerWithParameters.hpp @@ -18,7 +18,7 @@ public: const Parameters& GetParameters() const { return m_Param; } /// Helper to serialize the layer parameters to string - /// (currently used in DotSerializer and company) + /// (currently used in DotSerializer and company). void SerializeLayerParameters(ParameterStringifyFunction & fn) const { StringifyLayerParameters::Serialize(fn, m_Param); @@ -37,7 +37,7 @@ protected: ~LayerWithParameters() = default; - /// Helper function to reduce duplication in *Layer::CreateWorkload + /// Helper function to reduce duplication in *Layer::CreateWorkload. template WorkloadInfo PrepInfoAndDesc(QueueDescriptor& descriptor, const Graph& graph) const { @@ -45,7 +45,7 @@ protected: return Layer::PrepInfoAndDesc(descriptor, graph); } - /// The parameters for the layer (not including tensor-valued weights etc.) + /// The parameters for the layer (not including tensor-valued weights etc.). Parameters m_Param; }; diff --git a/src/armnn/layers/LstmLayer.cpp b/src/armnn/layers/LstmLayer.cpp new file mode 100644 index 0000000000..30c41bc9b8 --- /dev/null +++ b/src/armnn/layers/LstmLayer.cpp @@ -0,0 +1,259 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include "LstmLayer.hpp" + +#include "LayerCloneBase.hpp" + +#include +#include +#include + +namespace armnn +{ + +LstmLayer::LstmLayer(const LstmDescriptor& param, const char* name) + : LayerWithParameters(3, 4, LayerType::Lstm, param, name) +{ +} + +std::unique_ptr LstmLayer::CreateWorkload(const Graph& graph, const IWorkloadFactory& factory) const +{ + LstmQueueDescriptor descriptor; + + // Basic parameters + descriptor.m_InputToForgetWeights = m_BasicParameters.m_InputToForgetWeights.get(); + descriptor.m_InputToCellWeights = m_BasicParameters.m_InputToCellWeights.get(); + descriptor.m_InputToOutputWeights = m_BasicParameters.m_InputToOutputWeights.get(); + descriptor.m_RecurrentToForgetWeights = m_BasicParameters.m_RecurrentToForgetWeights.get(); + descriptor.m_RecurrentToCellWeights = m_BasicParameters.m_RecurrentToCellWeights.get(); + descriptor.m_RecurrentToOutputWeights = m_BasicParameters.m_RecurrentToOutputWeights.get(); + descriptor.m_ForgetGateBias = m_BasicParameters.m_ForgetGateBias.get(); + descriptor.m_CellBias = m_BasicParameters.m_CellBias.get(); + descriptor.m_OutputGateBias = m_BasicParameters.m_OutputGateBias.get(); + + // Cifg parameters + if (!m_Param.m_CifgEnabled) + { + descriptor.m_InputToInputWeights = m_CifgParameters.m_InputToInputWeights.get(); + descriptor.m_RecurrentToInputWeights = m_CifgParameters.m_RecurrentToInputWeights.get(); + descriptor.m_CellToInputWeights = m_CifgParameters.m_CellToInputWeights.get(); + descriptor.m_InputGateBias = m_CifgParameters.m_InputGateBias.get(); + } + + // Projection parameters + if (m_Param.m_ProjectionEnabled) + { + descriptor.m_ProjectionWeights = m_ProjectionParameters.m_ProjectionWeights.get(); + descriptor.m_ProjectionBias = m_ProjectionParameters.m_ProjectionBias.get(); + } + + // Peephole parameters + if (m_Param.m_PeepholeEnabled) + { + descriptor.m_CellToForgetWeights = m_PeepholeParameters.m_CellToForgetWeights.get(); + descriptor.m_CellToOutputWeights = m_PeepholeParameters.m_CellToOutputWeights.get(); + } + return factory.CreateLstm(descriptor, PrepInfoAndDesc(descriptor, graph)); +} + +LstmLayer* LstmLayer::Clone(Graph& graph) const +{ + auto layer = CloneBase(graph, m_Param, GetName()); + + layer->m_BasicParameters.m_InputToForgetWeights = m_BasicParameters.m_InputToForgetWeights ? + std::make_unique(*m_BasicParameters.m_InputToForgetWeights) + : nullptr; + layer->m_BasicParameters.m_InputToCellWeights = m_BasicParameters.m_InputToCellWeights ? + std::make_unique(*m_BasicParameters.m_InputToCellWeights) : nullptr; + layer->m_BasicParameters.m_InputToOutputWeights = m_BasicParameters.m_InputToOutputWeights ? + std::make_unique(*m_BasicParameters.m_InputToOutputWeights) : nullptr; + layer->m_BasicParameters.m_RecurrentToForgetWeights = m_BasicParameters.m_RecurrentToForgetWeights ? + std::make_unique(*m_BasicParameters.m_RecurrentToForgetWeights) : nullptr; + layer->m_BasicParameters.m_RecurrentToCellWeights = m_BasicParameters.m_RecurrentToCellWeights ? + std::make_unique(*m_BasicParameters.m_RecurrentToCellWeights) : nullptr; + layer->m_BasicParameters.m_RecurrentToOutputWeights = m_BasicParameters.m_RecurrentToOutputWeights ? + std::make_unique(*m_BasicParameters.m_RecurrentToOutputWeights) : nullptr; + layer->m_BasicParameters.m_ForgetGateBias = m_BasicParameters.m_ForgetGateBias ? + std::make_unique(*m_BasicParameters.m_ForgetGateBias) : nullptr; + layer->m_BasicParameters.m_CellBias = m_BasicParameters.m_CellBias ? + std::make_unique(*m_BasicParameters.m_CellBias) : nullptr; + layer->m_BasicParameters.m_OutputGateBias = m_BasicParameters.m_OutputGateBias ? + std::make_unique(*m_BasicParameters.m_OutputGateBias) : nullptr; + + if (!m_Param.m_CifgEnabled) + { + layer->m_CifgParameters.m_InputToInputWeights = m_CifgParameters.m_InputToInputWeights ? + std::make_unique(*m_CifgParameters.m_InputToInputWeights) : nullptr; + layer->m_CifgParameters.m_RecurrentToInputWeights = m_CifgParameters.m_RecurrentToInputWeights ? + std::make_unique(*m_CifgParameters.m_RecurrentToInputWeights) : nullptr; + layer->m_CifgParameters.m_CellToInputWeights = m_CifgParameters.m_CellToInputWeights ? + std::make_unique(*m_CifgParameters.m_CellToInputWeights) : nullptr; + layer->m_CifgParameters.m_InputGateBias = m_CifgParameters.m_InputGateBias ? + std::make_unique(*m_CifgParameters.m_InputGateBias) : nullptr; + } + + if (m_Param.m_ProjectionEnabled) + { + layer->m_ProjectionParameters.m_ProjectionWeights = m_ProjectionParameters.m_ProjectionWeights ? + std::make_unique(*m_ProjectionParameters.m_ProjectionWeights) : nullptr; + layer->m_ProjectionParameters.m_ProjectionBias = m_ProjectionParameters.m_ProjectionBias ? + std::make_unique(*m_ProjectionParameters.m_ProjectionBias) : nullptr; + } + + if (m_Param.m_PeepholeEnabled) + { + layer->m_PeepholeParameters.m_CellToForgetWeights = m_PeepholeParameters.m_CellToForgetWeights ? + std::make_unique(*m_PeepholeParameters.m_CellToForgetWeights) : nullptr; + layer->m_PeepholeParameters.m_CellToOutputWeights = m_PeepholeParameters.m_CellToOutputWeights ? + std::make_unique(*m_PeepholeParameters.m_CellToOutputWeights) : nullptr; + } + + return std::move(layer); +} + +std::vector LstmLayer::InferOutputShapes(const std::vector& inputShapes) const +{ + BOOST_ASSERT(inputShapes.size() == 3); + + // Get input values for validation + unsigned int batchSize = inputShapes[0][0]; + unsigned int outputSize = inputShapes[1][1]; + unsigned int numUnits = inputShapes[2][1]; + + std::vector outShapes; + if (!m_Param.m_CifgEnabled) + { + outShapes.push_back(TensorShape({batchSize, numUnits*3})); + } + else + { + outShapes.push_back(TensorShape({batchSize, numUnits*4})); + } + outShapes.push_back(TensorShape({batchSize, outputSize})); + outShapes.push_back(TensorShape({batchSize, numUnits})); + outShapes.push_back(TensorShape({batchSize, outputSize})); + + return outShapes; +} + +void LstmLayer::ValidateTensorShapesFromInputs() +{ + VerifyLayerConnections(3, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes( { + GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(), + GetInputSlot(1).GetConnection()->GetTensorInfo().GetShape(), + GetInputSlot(2).GetConnection()->GetTensorInfo().GetShape()} + ); + + BOOST_ASSERT(inferredShapes.size() == 4); + + // Check if the weights are nullptr + BOOST_ASSERT_MSG(m_BasicParameters.m_InputToForgetWeights != nullptr, + "LstmLayer: m_BasicParameters.m_InputToForgetWeights should not be null."); + BOOST_ASSERT_MSG(m_BasicParameters.m_InputToCellWeights != nullptr, + "LstmLayer: m_BasicParameters.m_InputToCellWeights should not be null."); + BOOST_ASSERT_MSG(m_BasicParameters.m_InputToOutputWeights != nullptr, + "LstmLayer: m_BasicParameters.m_InputToOutputWeights should not be null."); + BOOST_ASSERT_MSG(m_BasicParameters.m_RecurrentToForgetWeights != nullptr, + "LstmLayer: m_BasicParameters.m_RecurrentToForgetWeights should not be null."); + BOOST_ASSERT_MSG(m_BasicParameters.m_RecurrentToCellWeights != nullptr, + "LstmLayer: m_BasicParameters.m_RecurrentToCellWeights should not be null."); + BOOST_ASSERT_MSG(m_BasicParameters.m_RecurrentToOutputWeights != nullptr, + "LstmLayer: m_BasicParameters.m_RecurrentToOutputWeights should not be null."); + BOOST_ASSERT_MSG(m_BasicParameters.m_ForgetGateBias != nullptr, + "LstmLayer: m_BasicParameters.m_ForgetGateBias should not be null."); + BOOST_ASSERT_MSG(m_BasicParameters.m_CellBias != nullptr, + "LstmLayer: m_BasicParameters.m_CellBias should not be null."); + BOOST_ASSERT_MSG(m_BasicParameters.m_OutputGateBias != nullptr, + "LstmLayer: m_BasicParameters.m_OutputGateBias should not be null."); + + if (!m_Param.m_CifgEnabled) + { + BOOST_ASSERT_MSG(m_CifgParameters.m_InputToInputWeights != nullptr, + "LstmLayer: m_CifgParameters.m_InputToInputWeights should not be null."); + BOOST_ASSERT_MSG(m_CifgParameters.m_RecurrentToInputWeights != nullptr, + "LstmLayer: m_CifgParameters.m_RecurrentToInputWeights should not be null."); + BOOST_ASSERT_MSG(m_CifgParameters.m_InputGateBias != nullptr, + "LstmLayer: m_CifgParameters.m_InputGateBias should not be null."); + + ConditionalThrowIfNotEqual( + "LstmLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", + GetOutputSlot(0).GetTensorInfo().GetShape(), + inferredShapes[0]); + } + else + { + BOOST_ASSERT_MSG(m_CifgParameters.m_InputToInputWeights == nullptr, + "LstmLayer: m_CifgParameters.m_InputToInputWeights should not have a value when CIFG is enabled."); + BOOST_ASSERT_MSG(m_CifgParameters.m_RecurrentToInputWeights == nullptr, + "LstmLayer: m_CifgParameters.m_RecurrentToInputWeights should not have a value when CIFG is enabled."); + BOOST_ASSERT_MSG(m_CifgParameters.m_CellToInputWeights == nullptr, + "LstmLayer: m_CifgParameters.m_CellToInputWeights should not have a value when CIFG is enabled."); + BOOST_ASSERT_MSG(m_CifgParameters.m_InputGateBias == nullptr, + "LstmLayer: m_CifgParameters.m_InputGateBias should not have a value when CIFG is enabled."); + + ConditionalThrowIfNotEqual( + "LstmLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", + GetOutputSlot(0).GetTensorInfo().GetShape(), + inferredShapes[0]); + } + + if (m_Param.m_ProjectionEnabled) + { + BOOST_ASSERT_MSG(m_ProjectionParameters.m_ProjectionWeights != nullptr, + "LstmLayer: m_ProjectionParameters.m_ProjectionWeights should not be null."); + } + + if (m_Param.m_PeepholeEnabled) + { + BOOST_ASSERT_MSG(m_PeepholeParameters.m_CellToForgetWeights != nullptr, + "LstmLayer: m_PeepholeParameters.m_CellToForgetWeights should not be null."); + BOOST_ASSERT_MSG(m_PeepholeParameters.m_CellToOutputWeights != nullptr, + "LstmLayer: m_PeepholeParameters.m_CellToOutputWeights should not be null."); + } + + ConditionalThrowIfNotEqual( + "LstmLayer: TensorShape set on OutputSlot[1] does not match the inferred shape.", + GetOutputSlot(1).GetTensorInfo().GetShape(), + inferredShapes[1]); + ConditionalThrowIfNotEqual( + "LstmLayer: TensorShape set on OutputSlot[2] does not match the inferred shape.", + GetOutputSlot(2).GetTensorInfo().GetShape(), + inferredShapes[2]); + ConditionalThrowIfNotEqual( + "LstmLayer: TensorShape set on OutputSlot[3] does not match the inferred shape.", + GetOutputSlot(3).GetTensorInfo().GetShape(), + inferredShapes[3]); +} + +Layer::ConstantTensors LstmLayer::GetConstantTensorsByRef() +{ + return {m_BasicParameters.m_InputToForgetWeights, + m_BasicParameters.m_InputToCellWeights, + m_BasicParameters.m_InputToOutputWeights, + m_BasicParameters.m_RecurrentToForgetWeights, + m_BasicParameters.m_RecurrentToCellWeights, + m_BasicParameters.m_RecurrentToOutputWeights, + m_BasicParameters.m_ForgetGateBias, + m_BasicParameters.m_CellBias, + m_BasicParameters.m_OutputGateBias, + + // Cifg parameters + m_CifgParameters.m_InputToInputWeights, + m_CifgParameters.m_RecurrentToInputWeights, + m_CifgParameters.m_CellToInputWeights, + m_CifgParameters.m_InputGateBias, + + // Projection parameters + m_ProjectionParameters.m_ProjectionWeights, + m_ProjectionParameters.m_ProjectionBias, + + // Peephole parameters + m_PeepholeParameters.m_CellToForgetWeights, + m_PeepholeParameters.m_CellToOutputWeights}; +} + +} // namespace armnn diff --git a/src/armnn/layers/LstmLayer.hpp b/src/armnn/layers/LstmLayer.hpp new file mode 100644 index 0000000000..7133ad26a5 --- /dev/null +++ b/src/armnn/layers/LstmLayer.hpp @@ -0,0 +1,70 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "LayerWithParameters.hpp" + +namespace armnn +{ + +class ScopedCpuTensorHandle; + +struct LstmOptCifgParameters +{ + std::unique_ptr m_InputToInputWeights; + std::unique_ptr m_RecurrentToInputWeights; + std::unique_ptr m_CellToInputWeights; + std::unique_ptr m_InputGateBias; +}; + +struct LstmOptProjectionParameters +{ + std::unique_ptr m_ProjectionWeights; + std::unique_ptr m_ProjectionBias; +}; + +struct LstmOptPeepholeParameters +{ + std::unique_ptr m_CellToForgetWeights; + std::unique_ptr m_CellToOutputWeights; +}; + +struct LstmBasicParameters +{ + std::unique_ptr m_InputToForgetWeights; + std::unique_ptr m_InputToCellWeights; + std::unique_ptr m_InputToOutputWeights; + std::unique_ptr m_RecurrentToForgetWeights; + std::unique_ptr m_RecurrentToCellWeights; + std::unique_ptr m_RecurrentToOutputWeights; + std::unique_ptr m_ForgetGateBias; + std::unique_ptr m_CellBias; + std::unique_ptr m_OutputGateBias; +}; + +class LstmLayer : public LayerWithParameters +{ +public: + + LstmBasicParameters m_BasicParameters; + LstmOptCifgParameters m_CifgParameters; + LstmOptProjectionParameters m_ProjectionParameters; + LstmOptPeepholeParameters m_PeepholeParameters; + + virtual std::unique_ptr CreateWorkload(const Graph& graph, + const IWorkloadFactory& factory) const override; + LstmLayer* Clone(Graph& graph) const override; + + void ValidateTensorShapesFromInputs() override; + std::vector InferOutputShapes(const std::vector& inputShapes) const override; + +protected: + LstmLayer(const LstmDescriptor& param, const char* name); + ~LstmLayer() = default; + + Layer::ConstantTensors GetConstantTensorsByRef() override; +}; + +} // namespace diff --git a/src/armnn/layers/MemCopyLayer.cpp b/src/armnn/layers/MemCopyLayer.cpp index 973a756b21..83f77edf58 100644 --- a/src/armnn/layers/MemCopyLayer.cpp +++ b/src/armnn/layers/MemCopyLayer.cpp @@ -9,6 +9,7 @@ #include #include #include +#include namespace armnn { @@ -26,23 +27,23 @@ MemCopyLayer* MemCopyLayer::Clone(Graph& graph) const std::unique_ptr MemCopyLayer::CreateWorkload(const Graph& graph, const IWorkloadFactory& factory) const { MemCopyQueueDescriptor descriptor; - return factory.CreateMemCopy(descriptor, PrepInfoAndDesc(descriptor, graph)); + + //This is different from other workloads. Does not get created by the workload factory. + return std::make_unique(descriptor, PrepInfoAndDesc(descriptor, graph)); } void MemCopyLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow(GetInputSlot(0).GetConnection() != nullptr, - "MemCopyLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "MemCopyLayer: TensorInfo must be set on connected OutputSlot."); + VerifyLayerConnections(1, CHECK_LOCATION()); + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); - IOutputSlot* input = GetInputSlot(0).GetConnection(); + BOOST_ASSERT(inferredShapes.size() == 1); ConditionalThrowIfNotEqual( "MemCopyLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - input->GetTensorInfo().GetShape()); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/MergerLayer.cpp b/src/armnn/layers/MergerLayer.cpp index 065fc86a1b..e810b5e0bb 100644 --- a/src/armnn/layers/MergerLayer.cpp +++ b/src/armnn/layers/MergerLayer.cpp @@ -23,7 +23,7 @@ std::unique_ptr MergerLayer::CreateWorkload(const Graph& graph, const { MergerQueueDescriptor descriptor; - // copy the view origins to the descriptor + // Copies the view origins to the descriptor. descriptor.m_ViewOrigins.reserve(m_Param.GetNumViews()); for (unsigned int i = 0; i < m_Param.GetNumViews(); ++i) { @@ -36,9 +36,9 @@ std::unique_ptr MergerLayer::CreateWorkload(const Graph& graph, const void MergerLayer::CreateTensorHandles(Graph& graph, const IWorkloadFactory& factory) { - //if sub tensors are supported than the merger + //If sub tensors are supported than the merger //just needs to make sure that the outputs of the prev layer - //are made subtensors of the output of the merger layer + //are made subtensors of the output of the merger layer. m_OutputHandlers[0].CreateTensorHandles(factory); if (factory.SupportsSubTensors()) { @@ -76,33 +76,28 @@ MergerLayer* MergerLayer::Clone(Graph& graph) const return CloneBase(graph, m_Param, GetName()); } -void MergerLayer::ValidateTensorShapesFromInputs() +std::vector MergerLayer::InferOutputShapes(const std::vector& inputShapes) const { - // Validate Merger layer - ConditionalThrowIfNotEqual( - "MergerLayer: Num Inputs must match num views.", - m_Param.GetNumViews(), - GetNumInputSlots()); + BOOST_ASSERT(inputShapes.size() == m_Param.GetNumViews()); unsigned int numDims = m_Param.GetNumDimensions(); - for (unsigned int i=0; iGetTensorInfo(); + auto& inputShape = inputShapes[i]; - boost::ignore_unused(inputInfo); ConditionalThrowIfNotEqual( "MergerLayer: Num Dimensions must match all inputs.", numDims, - inputInfo.GetNumDimensions()); + inputShape.GetNumDimensions()); } - // Find the bounding box (extents) of all the views + // Finds the bounding box (extents) of all the views. std::vector extentMin(numDims); std::vector extentMax(numDims); - for (unsigned int i = 0; i < GetNumInputSlots(); i++) + for (unsigned int i = 0; i < inputShapes.size(); i++) { const uint32_t* origin = m_Param.GetViewOrigin(i); - const armnn::TensorShape& shape = GetInputSlot(i).GetConnection()->GetTensorInfo().GetShape(); + const armnn::TensorShape& shape = inputShapes[i]; for (unsigned int d = 0; d < numDims; d++) { extentMin[d] = std::min(extentMin[d], origin[d]); @@ -110,23 +105,23 @@ void MergerLayer::ValidateTensorShapesFromInputs() } } - // Check that the bounding box starts at the origin + // Checks that the bounding box starts at the origin. if (!std::all_of(extentMin.begin(), extentMin.end(), [](unsigned int s) { return s == 0; })) { throw LayerValidationException("MergerLayer: there is no view that starts at the origin"); } - // Check that there are no overlaps of views (this would lead to undefined output at those locations). - // Check each pair of views against each other - // (and don't bother to check against self, or check the same pair both ways round) - for (unsigned int a = 0; a < GetNumInputSlots(); a++) + // Checks that there are no overlaps of views (this would lead to undefined output at those locations). + // Checks each pair of views against each other + // (and doesn't bother to check against self, or check the same pair both ways round). + for (unsigned int a = 0; a < inputShapes.size(); a++) { const uint32_t* aOrigin = m_Param.GetViewOrigin(a); - const armnn::TensorShape& aShape = GetInputSlot(a).GetConnection()->GetTensorInfo().GetShape(); + const armnn::TensorShape& aShape = inputShapes[a]; for (unsigned int b = 0; b < a; b++) { const uint32_t* bOrigin = m_Param.GetViewOrigin(b); - const armnn::TensorShape& bShape = GetInputSlot(b).GetConnection()->GetTensorInfo().GetShape(); + const armnn::TensorShape& bShape = inputShapes[b]; bool allAxesOverlap = true; for (unsigned int d = 0; d < numDims && allAxesOverlap; d++) @@ -149,13 +144,13 @@ void MergerLayer::ValidateTensorShapesFromInputs() } } - // Check that there are no "holes", i.e. regions of the output which is not covered by a view. + // Checks that there are no "holes", i.e. regions of the output which is not covered by a view. // Because we already checked that there are no overlaps, this can be done simply by checking that // the total 'volume' of the views is the same as the output. unsigned int totalViewsVolume = 0; - for (unsigned int i = 0; i < GetNumInputSlots(); i++) + for (unsigned int i = 0; i < inputShapes.size(); i++) { - totalViewsVolume += GetInputSlot(i).GetConnection()->GetTensorInfo().GetNumElements(); + totalViewsVolume += inputShapes[i].GetNumElements(); } unsigned int outputVolume = 1; for (unsigned int d = 0; d < numDims; d++) @@ -168,11 +163,33 @@ void MergerLayer::ValidateTensorShapesFromInputs() totalViewsVolume, outputVolume); - TensorShape outShape(numDims, extentMax.data()); + return std::vector({ TensorShape({numDims, extentMax.data()}) }); +} + +void MergerLayer::ValidateTensorShapesFromInputs() +{ + // Validates Merger layer. + ConditionalThrowIfNotEqual( + "MergerLayer: Num Inputs must match num views.", + m_Param.GetNumViews(), + GetNumInputSlots()); + + VerifyLayerConnections(m_Param.GetNumViews(), CHECK_LOCATION()); + + std::vector inputShapes; + for (uint i = 0; i < GetNumInputSlots(); ++i) + { + inputShapes.push_back(GetInputSlot(i).GetConnection()->GetTensorInfo().GetShape()); + } + + auto inferredShapes = InferOutputShapes(inputShapes); + + BOOST_ASSERT(inferredShapes.size() == 1); + ConditionalThrowIfNotEqual( "MergerLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); } } // namespace armnn armnn diff --git a/src/armnn/layers/MergerLayer.hpp b/src/armnn/layers/MergerLayer.hpp index ad94cb5f3a..b6261027d4 100644 --- a/src/armnn/layers/MergerLayer.hpp +++ b/src/armnn/layers/MergerLayer.hpp @@ -19,6 +19,7 @@ public: MergerLayer* Clone(Graph& graph) const override; void ValidateTensorShapesFromInputs() override; + std::vector InferOutputShapes(const std::vector& inputShapes) const override; protected: MergerLayer(const OriginsDescriptor& param, const char* name); diff --git a/src/armnn/layers/MultiplicationLayer.cpp b/src/armnn/layers/MultiplicationLayer.cpp index af40a23007..ed7683da5f 100644 --- a/src/armnn/layers/MultiplicationLayer.cpp +++ b/src/armnn/layers/MultiplicationLayer.cpp @@ -31,41 +31,51 @@ MultiplicationLayer* MultiplicationLayer::Clone(Graph& graph) const return CloneBase(graph, GetName()); } -void MultiplicationLayer::ValidateTensorShapesFromInputs() +std::vector MultiplicationLayer::InferOutputShapes(const std::vector& inputShapes) const { - auto& input0 = GetInputSlot(0).GetConnection()->GetTensorInfo(); - auto& input1 = GetInputSlot(1).GetConnection()->GetTensorInfo(); + BOOST_ASSERT(inputShapes.size() == 2); + auto& input0 = inputShapes[0]; + auto& input1 = inputShapes[1]; - // Get the max of the inputs + // Get the max of the inputs. BOOST_ASSERT(input0.GetNumDimensions() == input1.GetNumDimensions()); unsigned int numDims = input0.GetNumDimensions(); std::vector dims(numDims); - // validate inputs are broadcast compatible -#if !NDEBUG for (unsigned int i = 0; i < numDims; i++) { - unsigned int dim0 = input0.GetShape()[i]; - unsigned int dim1 = input1.GetShape()[i]; + unsigned int dim0 = input0[i]; + unsigned int dim1 = input1[i]; + + // Validates inputs are broadcast compatible. +#if !NDEBUG if (dim0 != dim1) { BOOST_ASSERT_MSG(dim0 == 1 || dim1 == 1, "Dimensions should either match or one should be of size 1."); } - } #endif - for (unsigned int i = 0; i < numDims; i++) - { - unsigned int dim0 = input0.GetShape()[i]; - unsigned int dim1 = input1.GetShape()[i]; dims[i] = std::max(dim0, dim1); } - TensorShape outShape(numDims, dims.data()); + return std::vector({ TensorShape(numDims, dims.data()) }); +} + +void MultiplicationLayer::ValidateTensorShapesFromInputs() +{ + VerifyLayerConnections(2, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ + GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(), + GetInputSlot(1).GetConnection()->GetTensorInfo().GetShape() + }); + + BOOST_ASSERT(inferredShapes.size() == 1); + ConditionalThrowIfNotEqual( "MultiplicationLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/MultiplicationLayer.hpp b/src/armnn/layers/MultiplicationLayer.hpp index 48db9f4d01..bbfd1ee694 100644 --- a/src/armnn/layers/MultiplicationLayer.hpp +++ b/src/armnn/layers/MultiplicationLayer.hpp @@ -18,6 +18,7 @@ public: MultiplicationLayer* Clone(Graph& graph) const override; void ValidateTensorShapesFromInputs() override; + std::vector InferOutputShapes(const std::vector& inputShapes) const override; protected: MultiplicationLayer(const char* name); diff --git a/src/armnn/layers/NormalizationLayer.cpp b/src/armnn/layers/NormalizationLayer.cpp index cacd348444..261b16a307 100644 --- a/src/armnn/layers/NormalizationLayer.cpp +++ b/src/armnn/layers/NormalizationLayer.cpp @@ -31,14 +31,16 @@ NormalizationLayer* NormalizationLayer::Clone(Graph& graph) const void NormalizationLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow(GetInputSlot(0).GetConnection() != nullptr, - "NormalizationLayer: Input slot must be connected."); + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); - const TensorShape& outShape = GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(); ConditionalThrowIfNotEqual( "NormalizationLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/OutputLayer.cpp b/src/armnn/layers/OutputLayer.cpp index cadcf2da2f..748f275d74 100644 --- a/src/armnn/layers/OutputLayer.cpp +++ b/src/armnn/layers/OutputLayer.cpp @@ -29,7 +29,7 @@ OutputLayer* OutputLayer::Clone(Graph& graph) const void OutputLayer::ValidateTensorShapesFromInputs() { - // Just validate the input is connected + // Just validates that the input is connected. ConditionalThrow(GetInputSlot(0).GetConnection() != nullptr, "OutputLayer: Input slot must be connected."); } diff --git a/src/armnn/layers/PermuteLayer.cpp b/src/armnn/layers/PermuteLayer.cpp index 35692756a1..444de81320 100644 --- a/src/armnn/layers/PermuteLayer.cpp +++ b/src/armnn/layers/PermuteLayer.cpp @@ -31,19 +31,25 @@ PermuteLayer* PermuteLayer::Clone(Graph& graph) const return CloneBase(graph, m_Param, GetName()); } +std::vector PermuteLayer::InferOutputShapes(const std::vector& inputShapes) const +{ + BOOST_ASSERT(inputShapes.size() == 1); + const TensorShape& inShape = inputShapes[0]; + return std::vector ({armnnUtils::Permuted(inShape, m_Param.m_DimMappings)}); +} + void PermuteLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow(GetInputSlot(0).GetConnection() != nullptr, - "PermuteLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "PermuteLayer: TensorInfo must be set on connected InputSlot."); + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); - const TensorInfo& infoIn = GetInputSlot(0).GetConnection()->GetTensorInfo(); - TensorShape shapeOut = armnnUtils::Permuted(infoIn.GetShape(), m_Param.m_DimMappings); ConditionalThrowIfNotEqual( "PermuteLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - shapeOut); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/PermuteLayer.hpp b/src/armnn/layers/PermuteLayer.hpp index c060a16390..2700dd2c7b 100644 --- a/src/armnn/layers/PermuteLayer.hpp +++ b/src/armnn/layers/PermuteLayer.hpp @@ -18,6 +18,7 @@ public: PermuteLayer* Clone(Graph& graph) const override; void ValidateTensorShapesFromInputs() override; + std::vector InferOutputShapes(const std::vector& inputShapes) const override; const PermutationVector& GetPermutation() const { diff --git a/src/armnn/layers/Pooling2dLayer.cpp b/src/armnn/layers/Pooling2dLayer.cpp index ede37d7604..68049101e7 100644 --- a/src/armnn/layers/Pooling2dLayer.cpp +++ b/src/armnn/layers/Pooling2dLayer.cpp @@ -29,15 +29,10 @@ Pooling2dLayer* Pooling2dLayer::Clone(Graph& graph) const return CloneBase(graph, m_Param, GetName()); } -void Pooling2dLayer::ValidateTensorShapesFromInputs() +std::vector Pooling2dLayer::InferOutputShapes(const std::vector& inputShapes) const { - ConditionalThrow(GetInputSlot(0).GetConnection() != nullptr, - "Pooling2dLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "Pooling2dLayer: TensorInfo must be set on connected InputSlot."); - - IOutputSlot* input = GetInputSlot(0).GetConnection(); - const TensorShape& inputShape = input->GetTensorInfo().GetShape(); + BOOST_ASSERT(inputShapes.size() == 1); + const TensorShape& inputShape = inputShapes[0]; // If we support multiple batch dimensions in the future, then this assert will need to change. BOOST_ASSERT_MSG(inputShape.GetNumDimensions() == 4, "Pooling2dLayer will always have 4D input."); @@ -75,8 +70,8 @@ void Pooling2dLayer::ValidateTensorShapesFromInputs() BOOST_ASSERT_MSG(false, "Unsupported Output Shape Rounding"); } - // Make sure that border operations will start from inside the input and not the padded area - // This is what both Caffe and CL does... + // MakeS sure that border operations will start from inside the input and not the padded area. + // This is what both Caffe and CL do... if ((size - 1)*stride >= inSize + lowPad) { --size; @@ -89,18 +84,25 @@ void Pooling2dLayer::ValidateTensorShapesFromInputs() m_Param.m_PaddingMethod, m_Param.m_OutputShapeRounding); outHeight= CalcSize(inHeight, m_Param.m_PadTop, m_Param.m_PadBottom, m_Param.m_PoolHeight, m_Param.m_StrideY, m_Param.m_PaddingMethod, m_Param.m_OutputShapeRounding); - - } unsigned int outChannels = inChannels; unsigned int outBatchSize = inBatchSize; - TensorShape shapeOut({outBatchSize, outChannels, outHeight, outWidth}); + return std::vector({ TensorShape({outBatchSize, outChannels, outHeight, outWidth}) }); +} + +void Pooling2dLayer::ValidateTensorShapesFromInputs() +{ + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); ConditionalThrowIfNotEqual( "Pooling2dLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - shapeOut); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/Pooling2dLayer.hpp b/src/armnn/layers/Pooling2dLayer.hpp index af39dbb5ec..d5950d6ec3 100644 --- a/src/armnn/layers/Pooling2dLayer.hpp +++ b/src/armnn/layers/Pooling2dLayer.hpp @@ -9,19 +9,20 @@ namespace armnn { -class SoftmaxLayer : public LayerWithParameters +class Pooling2dLayer : public LayerWithParameters { public: virtual std::unique_ptr CreateWorkload(const Graph& graph, const IWorkloadFactory& factory) const override; - SoftmaxLayer* Clone(Graph& graph) const override; + Pooling2dLayer* Clone(Graph& graph) const override; void ValidateTensorShapesFromInputs() override; + std::vector InferOutputShapes(const std::vector& inputShapes) const override; protected: - SoftmaxLayer(const SoftmaxDescriptor& param, const char* name); - ~SoftmaxLayer() = default; + Pooling2dLayer(const Pooling2dDescriptor& param, const char* name); + ~Pooling2dLayer() = default; }; } // namespace diff --git a/src/armnn/layers/ReshapeLayer.cpp b/src/armnn/layers/ReshapeLayer.cpp index df5d9d5bb0..248a45c491 100644 --- a/src/armnn/layers/ReshapeLayer.cpp +++ b/src/armnn/layers/ReshapeLayer.cpp @@ -30,17 +30,23 @@ ReshapeLayer* ReshapeLayer::Clone(Graph& graph) const return CloneBase(graph, m_Param, GetName()); } +std::vector ReshapeLayer::InferOutputShapes(const std::vector& inputShapes) const +{ + return std::vector({ m_Param.m_TargetShape }); +} + void ReshapeLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow(GetInputSlot(0).GetConnection() != nullptr, - "ReshapeLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "ReshapeLayer: TensorInfo must be set on connected OutputSlot."); + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ }); + + BOOST_ASSERT(inferredShapes.size() == 1); ConditionalThrowIfNotEqual( "ReshapeLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - m_Param.m_TargetShape); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/ReshapeLayer.hpp b/src/armnn/layers/ReshapeLayer.hpp index 8a3cf3a698..4435ba9bf8 100644 --- a/src/armnn/layers/ReshapeLayer.hpp +++ b/src/armnn/layers/ReshapeLayer.hpp @@ -18,6 +18,7 @@ public: ReshapeLayer* Clone(Graph& graph) const override; void ValidateTensorShapesFromInputs() override; + std::vector InferOutputShapes(const std::vector& inputShapes) const override; bool IsEqual(const Layer& other) const { diff --git a/src/armnn/layers/ResizeBilinearLayer.cpp b/src/armnn/layers/ResizeBilinearLayer.cpp index 204d5afae8..6477fa375a 100644 --- a/src/armnn/layers/ResizeBilinearLayer.cpp +++ b/src/armnn/layers/ResizeBilinearLayer.cpp @@ -30,23 +30,31 @@ ResizeBilinearLayer* ResizeBilinearLayer::Clone(Graph& graph) const return CloneBase(graph, m_Param, GetName()); } -void ResizeBilinearLayer::ValidateTensorShapesFromInputs() +std::vector ResizeBilinearLayer::InferOutputShapes(const std::vector& inputShapes) const { - ConditionalThrow(GetInputSlot(0).GetConnection() != nullptr, - "MemCopyLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "MemCopyLayer: TensorInfo must be set on connected OutputSlot."); + BOOST_ASSERT(inputShapes.size() == 1); + const TensorShape& inputShape = inputShapes[0]; - const TensorShape& inputShape = GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(); unsigned int outWidth = m_Param.m_TargetWidth; unsigned int outHeight = m_Param.m_TargetHeight; unsigned int outChannels = inputShape[1]; unsigned int outBatch = inputShape[0]; - TensorShape outShape({outBatch, outChannels, outHeight, outWidth}); + + return std::vector({ TensorShape({outBatch, outChannels, outHeight, outWidth}) }); +} + +void ResizeBilinearLayer::ValidateTensorShapesFromInputs() +{ + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); + ConditionalThrowIfNotEqual( "ResizeBilinearLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/ResizeBilinearLayer.hpp b/src/armnn/layers/ResizeBilinearLayer.hpp index 2cefedb0b8..e6798ce531 100644 --- a/src/armnn/layers/ResizeBilinearLayer.hpp +++ b/src/armnn/layers/ResizeBilinearLayer.hpp @@ -18,6 +18,7 @@ public: ResizeBilinearLayer* Clone(Graph& graph) const override; void ValidateTensorShapesFromInputs() override; + std::vector InferOutputShapes(const std::vector& inputShapes) const override; protected: ResizeBilinearLayer(const ResizeBilinearDescriptor& param, const char* name); diff --git a/src/armnn/layers/SoftmaxLayer.cpp b/src/armnn/layers/SoftmaxLayer.cpp index 2bd0c1d106..7c42b7a3c9 100644 --- a/src/armnn/layers/SoftmaxLayer.cpp +++ b/src/armnn/layers/SoftmaxLayer.cpp @@ -31,14 +31,16 @@ SoftmaxLayer* SoftmaxLayer::Clone(Graph& graph) const void SoftmaxLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow(GetInputSlot(0).GetConnection() != nullptr, - "SoftmaxLayer: Input slot must be connected."); + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); - const TensorShape& outShape = GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(); ConditionalThrowIfNotEqual( "SoftmaxLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/SoftmaxLayer.hpp b/src/armnn/layers/SoftmaxLayer.hpp index ff60a08a91..af39dbb5ec 100644 --- a/src/armnn/layers/SoftmaxLayer.hpp +++ b/src/armnn/layers/SoftmaxLayer.hpp @@ -9,19 +9,19 @@ namespace armnn { -class Pooling2dLayer : public LayerWithParameters +class SoftmaxLayer : public LayerWithParameters { public: virtual std::unique_ptr CreateWorkload(const Graph& graph, const IWorkloadFactory& factory) const override; - Pooling2dLayer* Clone(Graph& graph) const override; + SoftmaxLayer* Clone(Graph& graph) const override; void ValidateTensorShapesFromInputs() override; protected: - Pooling2dLayer(const Pooling2dDescriptor& param, const char* name); - ~Pooling2dLayer() = default; + SoftmaxLayer(const SoftmaxDescriptor& param, const char* name); + ~SoftmaxLayer() = default; }; } // namespace diff --git a/src/armnn/layers/SplitterLayer.cpp b/src/armnn/layers/SplitterLayer.cpp index 630921e4d8..5e737a245e 100644 --- a/src/armnn/layers/SplitterLayer.cpp +++ b/src/armnn/layers/SplitterLayer.cpp @@ -22,7 +22,7 @@ std::unique_ptr SplitterLayer::CreateWorkload(const Graph& graph, con { SplitterQueueDescriptor descriptor; - // copy the window origins to the descriptor + // Copies the window origins to the descriptor. for (unsigned int i = 0; i < m_Param.GetNumViews(); ++i) { descriptor.m_ViewOrigins.emplace_back( @@ -34,14 +34,14 @@ std::unique_ptr SplitterLayer::CreateWorkload(const Graph& graph, con void SplitterLayer::CreateTensorHandles(Graph& graph, const IWorkloadFactory& factory) { - //if sub tensors are supported than all the "splitter" need to do is to + //If sub tensors are supported than all the "splitter" need to do is to //set the outputs to be appropriate sub tensors of the input. if (factory.SupportsSubTensors()) { const OutputHandler& outputHandler = GetInputSlots()[0].GetConnectedOutputSlot()->GetOutputHandler(); ITensorHandle* inputData = outputHandler.GetData(); - //create the outputs as subtensors of the input + //Creates the outputs as subtensors of the input. for (unsigned int i = 0; i < m_Param.GetNumViews(); ++i) { m_OutputHandlers[i].SetData(factory.CreateSubTensorHandle(*inputData, @@ -63,18 +63,38 @@ SplitterLayer* SplitterLayer::Clone(Graph& graph) const return CloneBase(graph, m_Param, GetName()); } -void SplitterLayer::ValidateTensorShapesFromInputs() +std::vector SplitterLayer::InferOutputShapes(const std::vector& inputShapes) const { + BOOST_ASSERT(inputShapes.size() == m_Param.GetNumViews()); + std::vector outShapes; //Output shapes must match View shapes. for (unsigned int viewIdx = 0; viewIdx < m_Param.GetNumViews(); viewIdx++) { const uint32_t* sizes = m_Param.GetViewSizes(viewIdx); + outShapes.push_back(TensorShape(m_Param.GetNumDimensions(), sizes)); + } + return outShapes; +} + +void SplitterLayer::ValidateTensorShapesFromInputs() +{ + std::vector views; + for (unsigned int viewIdx = 0; viewIdx < m_Param.GetNumViews(); viewIdx++) + { + const uint32_t* sizes = m_Param.GetViewSizes(viewIdx); + views.push_back(TensorShape(m_Param.GetNumDimensions(), sizes)); + } + + auto inferredShapes = InferOutputShapes(views); - TensorShape outShape(m_Param.GetNumDimensions(), sizes); + BOOST_ASSERT(inferredShapes.size() == m_Param.GetNumViews()); + + for (unsigned int viewIdx = 0; viewIdx < m_Param.GetNumViews(); viewIdx++) + { ConditionalThrowIfNotEqual( "SplitterLayer: View sizes must match output tensor shapes.", GetOutputSlot(viewIdx).GetTensorInfo().GetShape(), - outShape); + inferredShapes[viewIdx]); } } diff --git a/src/armnn/layers/SplitterLayer.hpp b/src/armnn/layers/SplitterLayer.hpp index 7e5bbd2668..8e361b4d5c 100644 --- a/src/armnn/layers/SplitterLayer.hpp +++ b/src/armnn/layers/SplitterLayer.hpp @@ -19,6 +19,7 @@ public: SplitterLayer* Clone(Graph& graph) const override; void ValidateTensorShapesFromInputs() override; + std::vector InferOutputShapes(const std::vector& inputShapes) const override; protected: SplitterLayer(const ViewsDescriptor& param, const char* name); diff --git a/src/armnn/memory/BaseMemoryManager.cpp b/src/armnn/memory/BaseMemoryManager.cpp new file mode 100644 index 0000000000..07f42333d6 --- /dev/null +++ b/src/armnn/memory/BaseMemoryManager.cpp @@ -0,0 +1,125 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include "BaseMemoryManager.hpp" + +#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) +#include "memory/BlobLifetimeManager.hpp" +#include "memory/PoolManager.hpp" +#include "memory/OffsetLifetimeManager.hpp" +#endif + +#include + +namespace armnn +{ + +#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) +BaseMemoryManager::BaseMemoryManager(std::unique_ptr alloc, + MemoryAffinity memoryAffinity) +{ + // (Re)create the memory manager components. + m_Allocator = std::move(alloc); + + m_IntraLayerMemoryMgr = CreateArmComputeMemoryManager(memoryAffinity); + m_InterLayerMemoryMgr = CreateArmComputeMemoryManager(memoryAffinity); +} + +std::shared_ptr +BaseMemoryManager::CreateArmComputeMemoryManager(MemoryAffinity memoryAffinity) +{ + std::shared_ptr lifetimeManager = nullptr; + + if (memoryAffinity == MemoryAffinity::Buffer) + { + lifetimeManager = std::make_shared(); + } + else + { + lifetimeManager = std::make_shared(); + } + + auto poolManager = std::make_shared(); + auto memoryManager = std::make_shared(lifetimeManager, poolManager); + + // Set allocator that the memory manager will use + memoryManager->set_allocator(m_Allocator.get()); + + return memoryManager; +} + +void BaseMemoryManager::FinalizeMemoryManager(arm_compute::MemoryManagerOnDemand& memoryManager) +{ + // Number of pools that the manager will create. This specifies how many layers you want to run in parallel + memoryManager.set_num_pools(1); + + // Finalize the memory manager. (Validity checks, memory allocations, etc) + memoryManager.finalize(); +} + +void BaseMemoryManager::Finalize() +{ + BOOST_ASSERT(m_IntraLayerMemoryMgr); + FinalizeMemoryManager(*m_IntraLayerMemoryMgr.get()); + + BOOST_ASSERT(m_InterLayerMemoryMgr); + FinalizeMemoryManager(*m_InterLayerMemoryMgr.get()); +} + +void BaseMemoryManager::Acquire() +{ + // Allocate memory pools for intra-layer memory manager + BOOST_ASSERT(m_IntraLayerMemoryMgr); + IPoolManager* poolManager = boost::polymorphic_downcast(m_IntraLayerMemoryMgr->pool_manager()); + BOOST_ASSERT(poolManager); + poolManager->AllocatePools(); + + // Allocate memory pools for inter-layer memory manager + BOOST_ASSERT(m_InterLayerMemoryMgr); + poolManager = boost::polymorphic_downcast(m_InterLayerMemoryMgr->pool_manager()); + BOOST_ASSERT(poolManager); + poolManager->AllocatePools(); + + // Acquire inter-layer memory group. NOTE: This has to come after allocating the pools + BOOST_ASSERT(m_InterLayerMemoryGroup); + m_InterLayerMemoryGroup->acquire(); +} + +void BaseMemoryManager::Release() +{ + // Release inter-layer memory group. NOTE: This has to come before releasing the pools + BOOST_ASSERT(m_InterLayerMemoryGroup); + m_InterLayerMemoryGroup->release(); + + // Release memory pools managed by intra-layer memory manager + BOOST_ASSERT(m_IntraLayerMemoryMgr); + IPoolManager* poolManager = boost::polymorphic_downcast(m_IntraLayerMemoryMgr->pool_manager()); + BOOST_ASSERT(poolManager); + poolManager->ReleasePools(); + + // Release memory pools managed by inter-layer memory manager + BOOST_ASSERT(m_InterLayerMemoryMgr); + poolManager = boost::polymorphic_downcast(m_InterLayerMemoryMgr->pool_manager()); + BOOST_ASSERT(poolManager); + poolManager->ReleasePools(); +} +#endif + +#ifdef ARMCOMPUTENEON_ENABLED +std::shared_ptr +NeonMemoryManager::CreateMemoryGroup(const std::shared_ptr& memoryManager) +{ + return std::make_shared(memoryManager); +} +#endif + +#ifdef ARMCOMPUTECL_ENABLED +std::shared_ptr +ClMemoryManager::CreateMemoryGroup(const std::shared_ptr& memoryManager) +{ + return std::make_shared(memoryManager); +} +#endif + +} diff --git a/src/armnn/memory/BaseMemoryManager.hpp b/src/armnn/memory/BaseMemoryManager.hpp new file mode 100644 index 0000000000..433d0ea9ad --- /dev/null +++ b/src/armnn/memory/BaseMemoryManager.hpp @@ -0,0 +1,104 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "backends/WorkloadFactory.hpp" + +#ifdef ARMCOMPUTENEON_ENABLED +#include "arm_compute/runtime/MemoryGroup.h" +#endif + +#ifdef ARMCOMPUTECL_ENABLED +#include "arm_compute/runtime/CL/CLMemoryGroup.h" +#endif + +#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) +#include "arm_compute/runtime/IAllocator.h" +#include "arm_compute/runtime/IMemoryGroup.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" +#endif + +namespace armnn +{ + +class BaseMemoryManager +{ +public: + enum class MemoryAffinity + { + Buffer, + Offset + }; + + BaseMemoryManager() { } + virtual ~BaseMemoryManager() { } + +#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) + + BaseMemoryManager(std::unique_ptr alloc, MemoryAffinity memoryAffinity); + + std::shared_ptr& GetIntraLayerManager() { return m_IntraLayerMemoryMgr; } + std::shared_ptr& GetInterLayerManager() { return m_InterLayerMemoryMgr; } + std::shared_ptr& GetInterLayerMemoryGroup() { return m_InterLayerMemoryGroup; } + + void Finalize(); + void Acquire(); + void Release(); + +protected: + + std::unique_ptr m_Allocator; + std::shared_ptr m_IntraLayerMemoryMgr; + std::shared_ptr m_InterLayerMemoryMgr; + std::shared_ptr m_InterLayerMemoryGroup; + + std::shared_ptr CreateArmComputeMemoryManager(MemoryAffinity memoryAffinity); + + virtual std::shared_ptr + CreateMemoryGroup(const std::shared_ptr& memoryManager) = 0; + + void FinalizeMemoryManager(arm_compute::MemoryManagerOnDemand& memoryManager); +#endif +}; + +class NeonMemoryManager : public BaseMemoryManager +{ +public: + NeonMemoryManager() {} + virtual ~NeonMemoryManager() {} + +#ifdef ARMCOMPUTENEON_ENABLED + NeonMemoryManager(std::unique_ptr alloc, MemoryAffinity memoryAffinity) + : BaseMemoryManager(std::move(alloc), memoryAffinity) + { + m_InterLayerMemoryGroup = CreateMemoryGroup(m_InterLayerMemoryMgr); + } + +protected: + virtual std::shared_ptr + CreateMemoryGroup(const std::shared_ptr& memoryManager) override; +#endif +}; + +class ClMemoryManager : public BaseMemoryManager +{ +public: + ClMemoryManager() {} + virtual ~ClMemoryManager() {} + +#ifdef ARMCOMPUTECL_ENABLED + ClMemoryManager(std::unique_ptr alloc) + : BaseMemoryManager(std::move(alloc), MemoryAffinity::Buffer) + { + m_InterLayerMemoryGroup = CreateMemoryGroup(m_InterLayerMemoryMgr); + } + +protected: + virtual std::shared_ptr + CreateMemoryGroup(const std::shared_ptr& memoryManager) override; +#endif +}; + +} //namespace armnn \ No newline at end of file diff --git a/src/armnn/memory/BlobLifetimeManager.cpp b/src/armnn/memory/BlobLifetimeManager.cpp new file mode 100644 index 0000000000..5b085b2f5e --- /dev/null +++ b/src/armnn/memory/BlobLifetimeManager.cpp @@ -0,0 +1,79 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include "BlobLifetimeManager.hpp" +#include "BlobMemoryPool.hpp" + +#include "arm_compute/runtime/IMemoryGroup.h" + +#include "boost/assert.hpp" + +#include + +namespace armnn +{ + +BlobLifetimeManager::BlobLifetimeManager() + : m_BlobSizes() +{ +} + +arm_compute::MappingType BlobLifetimeManager::mapping_type() const +{ + return arm_compute::MappingType::BLOBS; +} + +void BlobLifetimeManager::update_blobs_and_mappings() +{ + using namespace arm_compute; + + BOOST_ASSERT(are_all_finalized()); + BOOST_ASSERT(_active_group); + + // Sort free blobs requirements in descending order. + _free_blobs.sort([](const Blob & ba, const Blob & bb) + { + return ba.max_size > bb.max_size; + }); + std::vector groupSizes; + std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(groupSizes), [](const Blob & b) + { + return b.max_size; + }); + + // Update blob sizes + size_t max_size = std::max(m_BlobSizes.size(), groupSizes.size()); + m_BlobSizes.resize(max_size, 0); + groupSizes.resize(max_size, 0); + std::transform(std::begin(m_BlobSizes), std::end(m_BlobSizes), std::begin(groupSizes), + std::begin(m_BlobSizes), [](size_t lhs, size_t rhs) + { + return std::max(lhs, rhs); + }); + + // Calculate group mappings + auto& groupMappings = _active_group->mappings(); + unsigned int blobIdx = 0; + + for(auto& freeBlob : _free_blobs) + { + for(auto& boundElementId : freeBlob.bound_elements) + { + BOOST_ASSERT(_active_elements.find(boundElementId) != std::end(_active_elements)); + + Element& boundElement = _active_elements[boundElementId]; + groupMappings[boundElement.handle] = blobIdx; + } + + ++blobIdx; + } +} + +std::unique_ptr BlobLifetimeManager::create_pool(arm_compute::IAllocator* allocator) +{ + BOOST_ASSERT(allocator); + return std::make_unique(allocator, m_BlobSizes); +} + +} // namespace armnn \ No newline at end of file diff --git a/src/armnn/memory/BlobLifetimeManager.hpp b/src/armnn/memory/BlobLifetimeManager.hpp new file mode 100644 index 0000000000..8bb8b326c4 --- /dev/null +++ b/src/armnn/memory/BlobLifetimeManager.hpp @@ -0,0 +1,35 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "arm_compute/runtime/ISimpleLifetimeManager.h" + +namespace armnn +{ + +class BlobLifetimeManager : public arm_compute::ISimpleLifetimeManager +{ +public: + BlobLifetimeManager(); + + BlobLifetimeManager(const BlobLifetimeManager&) = delete; + + BlobLifetimeManager& operator=(const BlobLifetimeManager&) = delete; + + BlobLifetimeManager(BlobLifetimeManager&&) = default; + + BlobLifetimeManager& operator=(BlobLifetimeManager&&) = default; + + std::unique_ptr create_pool(arm_compute::IAllocator* allocator) override; + + arm_compute::MappingType mapping_type() const override; + +private: + void update_blobs_and_mappings() override; + + std::vector m_BlobSizes; +}; + +} // namespace armnn \ No newline at end of file diff --git a/src/armnn/memory/BlobMemoryPool.cpp b/src/armnn/memory/BlobMemoryPool.cpp new file mode 100644 index 0000000000..c9f44a4dc6 --- /dev/null +++ b/src/armnn/memory/BlobMemoryPool.cpp @@ -0,0 +1,88 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include "BlobMemoryPool.hpp" + +#include + +namespace armnn +{ + +BlobMemoryPool::BlobMemoryPool(arm_compute::IAllocator* allocator, std::vector blobSizes) + : m_Allocator(allocator) + , m_Blobs() + , m_BlobSizes(std::move(blobSizes)) + , m_MemoryAllocated(false) +{ + AllocatePool(); +} + +BlobMemoryPool::~BlobMemoryPool() +{ + ReleasePool(); +} + +void BlobMemoryPool::acquire(arm_compute::MemoryMappings& handles) +{ + // Set memory to handlers + for (auto& handle : handles) + { + BOOST_ASSERT(handle.first); + *handle.first = m_Blobs[handle.second]; + } +} + +void BlobMemoryPool::release(arm_compute::MemoryMappings &handles) +{ + for (auto& handle : handles) + { + BOOST_ASSERT(handle.first); + *handle.first = nullptr; + } +} + +arm_compute::MappingType BlobMemoryPool::mapping_type() const +{ + return arm_compute::MappingType::BLOBS; +} + +std::unique_ptr BlobMemoryPool::duplicate() +{ + BOOST_ASSERT(m_Allocator); + return std::make_unique(m_Allocator, m_BlobSizes); +} + +void BlobMemoryPool::AllocatePool() +{ + if (!m_MemoryAllocated) + { + BOOST_ASSERT(m_Allocator); + + for (const auto& blobSize : m_BlobSizes) + { + m_Blobs.push_back(m_Allocator->allocate(blobSize, 0)); + } + + m_MemoryAllocated = true; + } +} + +void BlobMemoryPool::ReleasePool() +{ + if (m_MemoryAllocated) + { + BOOST_ASSERT(m_Allocator); + + for (auto& blob : m_Blobs) + { + m_Allocator->free(blob); + } + + m_Blobs.clear(); + + m_MemoryAllocated = false; + } +} + +} // namespace armnn \ No newline at end of file diff --git a/src/armnn/memory/BlobMemoryPool.hpp b/src/armnn/memory/BlobMemoryPool.hpp new file mode 100644 index 0000000000..b17db2ea65 --- /dev/null +++ b/src/armnn/memory/BlobMemoryPool.hpp @@ -0,0 +1,55 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "IMemoryPool.hpp" + +#include "arm_compute/runtime/IAllocator.h" +#include "arm_compute/runtime/Types.h" + +namespace armnn +{ + +/** Blob memory pool */ +class BlobMemoryPool : public IMemoryPool +{ +public: + BlobMemoryPool(arm_compute::IAllocator* allocator, std::vector blobSizes); + + ~BlobMemoryPool(); + + BlobMemoryPool(const BlobMemoryPool&) = delete; + + BlobMemoryPool& operator=(const BlobMemoryPool&) = delete; + + BlobMemoryPool(BlobMemoryPool&&) = default; + + BlobMemoryPool& operator=(BlobMemoryPool&&) = default; + + void acquire(arm_compute::MemoryMappings &handles) override; + void release(arm_compute::MemoryMappings &handles) override; + + arm_compute::MappingType mapping_type() const override; + + std::unique_ptr duplicate() override; + + void AllocatePool() override; + void ReleasePool() override; + +private: + /// Allocator to use for internal allocation + arm_compute::IAllocator* m_Allocator; + + /// Vector holding all the memory blobs + std::vector m_Blobs; + + /// Sizes of each memory blob + std::vector m_BlobSizes; + + /// Flag indicating whether memory has been allocated for the pool + bool m_MemoryAllocated; +}; + +} // namespace armnn \ No newline at end of file diff --git a/src/armnn/memory/IMemoryPool.hpp b/src/armnn/memory/IMemoryPool.hpp new file mode 100644 index 0000000000..8c73b484c4 --- /dev/null +++ b/src/armnn/memory/IMemoryPool.hpp @@ -0,0 +1,22 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "arm_compute/runtime/IMemoryPool.h" + +namespace armnn +{ + +class IMemoryPool : public arm_compute::IMemoryPool +{ +public: + /// Allocates memory for the entire pool + virtual void AllocatePool() = 0; + + /// Releases all memory associated with the pool + virtual void ReleasePool() = 0; +}; + +} // namespace armnn \ No newline at end of file diff --git a/src/armnn/memory/IPoolManager.hpp b/src/armnn/memory/IPoolManager.hpp new file mode 100644 index 0000000000..9b06152538 --- /dev/null +++ b/src/armnn/memory/IPoolManager.hpp @@ -0,0 +1,21 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "arm_compute/runtime/IPoolManager.h" + +namespace armnn +{ + +class IPoolManager : public arm_compute::IPoolManager { +public: + // Allocates all pools within the pool manager + virtual void AllocatePools() = 0; + + // Releases all pools within the pool manager + virtual void ReleasePools() = 0; +}; + +} // namespace armnn \ No newline at end of file diff --git a/src/armnn/memory/OffsetLifetimeManager.cpp b/src/armnn/memory/OffsetLifetimeManager.cpp new file mode 100644 index 0000000000..bcbbb0b793 --- /dev/null +++ b/src/armnn/memory/OffsetLifetimeManager.cpp @@ -0,0 +1,62 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include "OffsetLifetimeManager.hpp" +#include "OffsetMemoryPool.hpp" + +#include "arm_compute/runtime/IMemoryGroup.h" + +#include + +#include "boost/assert.hpp" + +namespace armnn +{ + +OffsetLifetimeManager::OffsetLifetimeManager() + : m_BlobSize(0) +{ +} + +std::unique_ptr OffsetLifetimeManager::create_pool(arm_compute::IAllocator* allocator) +{ + BOOST_ASSERT(allocator); + return std::make_unique(allocator, m_BlobSize); +} + +arm_compute::MappingType OffsetLifetimeManager::mapping_type() const +{ + return arm_compute::MappingType::OFFSETS; +} + +void OffsetLifetimeManager::update_blobs_and_mappings() +{ + BOOST_ASSERT(are_all_finalized()); + BOOST_ASSERT(_active_group); + + // Update blob size + size_t maxGroupSize = std::accumulate(std::begin(_free_blobs), std::end(_free_blobs), + static_cast(0), [](size_t s, const Blob& b) + { + return s + b.max_size; + }); + m_BlobSize = std::max(m_BlobSize, maxGroupSize); + + // Calculate group mappings + auto& groupMappings = _active_group->mappings(); + size_t offset = 0; + for(auto& freeBlob : _free_blobs) + { + for(auto& boundElementId : freeBlob.bound_elements) + { + BOOST_ASSERT(_active_elements.find(boundElementId) != std::end(_active_elements)); + Element& boundElement = _active_elements[boundElementId]; + groupMappings[boundElement.handle] = offset; + } + offset += freeBlob.max_size; + BOOST_ASSERT(offset <= m_BlobSize); + } +} + +} // namespace armnn \ No newline at end of file diff --git a/src/armnn/memory/OffsetLifetimeManager.hpp b/src/armnn/memory/OffsetLifetimeManager.hpp new file mode 100644 index 0000000000..d6a5698d95 --- /dev/null +++ b/src/armnn/memory/OffsetLifetimeManager.hpp @@ -0,0 +1,37 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "arm_compute/runtime/ISimpleLifetimeManager.h" + +namespace armnn +{ + +class OffsetLifetimeManager : public arm_compute::ISimpleLifetimeManager +{ +public: + OffsetLifetimeManager(); + + OffsetLifetimeManager(const OffsetLifetimeManager&) = delete; + + OffsetLifetimeManager& operator=(const OffsetLifetimeManager&) = delete; + + OffsetLifetimeManager(OffsetLifetimeManager&&) = default; + + OffsetLifetimeManager& operator=(OffsetLifetimeManager&&) = default; + + std::unique_ptr create_pool(arm_compute::IAllocator* allocator) override; + + arm_compute::MappingType mapping_type() const override; + +private: + void update_blobs_and_mappings() override; + +private: + /// Memory blob size + size_t m_BlobSize; +}; + +} // namespace armnn \ No newline at end of file diff --git a/src/armnn/memory/OffsetMemoryPool.cpp b/src/armnn/memory/OffsetMemoryPool.cpp new file mode 100644 index 0000000000..cae79c0a86 --- /dev/null +++ b/src/armnn/memory/OffsetMemoryPool.cpp @@ -0,0 +1,84 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include "OffsetMemoryPool.hpp" + +#include "boost/assert.hpp" + +#include + +namespace armnn +{ + +OffsetMemoryPool::OffsetMemoryPool(arm_compute::IAllocator* allocator, size_t blobSize) + : m_Allocator(allocator) + , m_Blob() + , m_BlobSize(blobSize) + , m_MemoryAllocated(false) +{ + AllocatePool(); +} + +OffsetMemoryPool::~OffsetMemoryPool() +{ + ReleasePool(); +} + +void OffsetMemoryPool::acquire(arm_compute::MemoryMappings& handles) +{ + BOOST_ASSERT(m_Blob); + + // Set memory to handlers + for(auto& handle : handles) + { + BOOST_ASSERT(handle.first); + *handle.first = reinterpret_cast(m_Blob) + handle.second; + } +} + +void OffsetMemoryPool::release(arm_compute::MemoryMappings &handles) +{ + for(auto& handle : handles) + { + BOOST_ASSERT(handle.first); + *handle.first = nullptr; + } +} + +arm_compute::MappingType OffsetMemoryPool::mapping_type() const +{ + return arm_compute::MappingType::OFFSETS; +} + +std::unique_ptr OffsetMemoryPool::duplicate() +{ + BOOST_ASSERT(m_Allocator); + return std::make_unique(m_Allocator, m_BlobSize); +} + +void OffsetMemoryPool::AllocatePool() +{ + if (!m_MemoryAllocated) + { + BOOST_ASSERT(m_Allocator); + m_Blob = m_Allocator->allocate(m_BlobSize, 0); + + m_MemoryAllocated = true; + } +} + +void OffsetMemoryPool::ReleasePool() +{ + if (m_MemoryAllocated) + { + BOOST_ASSERT(m_Allocator); + + m_Allocator->free(m_Blob); + m_Blob = nullptr; + + m_MemoryAllocated = false; + } +} + +} // namespace armnn \ No newline at end of file diff --git a/src/armnn/memory/OffsetMemoryPool.hpp b/src/armnn/memory/OffsetMemoryPool.hpp new file mode 100644 index 0000000000..a0391602fb --- /dev/null +++ b/src/armnn/memory/OffsetMemoryPool.hpp @@ -0,0 +1,54 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "IMemoryPool.hpp" + +#include "arm_compute/runtime/IAllocator.h" +#include "arm_compute/runtime/Types.h" + +namespace armnn +{ + +class OffsetMemoryPool : public IMemoryPool +{ +public: + OffsetMemoryPool(arm_compute::IAllocator* allocator, size_t blobSize); + + ~OffsetMemoryPool(); + + OffsetMemoryPool(const OffsetMemoryPool&) = delete; + + OffsetMemoryPool& operator=(const OffsetMemoryPool&) = delete; + + OffsetMemoryPool(OffsetMemoryPool&&) = default; + + OffsetMemoryPool& operator=(OffsetMemoryPool &&) = default; + + void acquire(arm_compute::MemoryMappings& handles) override; + void release(arm_compute::MemoryMappings& handles) override; + + arm_compute::MappingType mapping_type() const override; + + std::unique_ptr duplicate() override; + + void AllocatePool() override; + void ReleasePool() override; + +private: + /// Allocator to use for internal allocation + arm_compute::IAllocator* m_Allocator; + + /// Memory blob + void* m_Blob; + + /// Size of the allocated memory blob + size_t m_BlobSize; + + /// Flag indicating whether memory has been allocated for the pool + bool m_MemoryAllocated; +}; + +} // namespace armnn \ No newline at end of file diff --git a/src/armnn/memory/PoolManager.cpp b/src/armnn/memory/PoolManager.cpp new file mode 100644 index 0000000000..52cef47476 --- /dev/null +++ b/src/armnn/memory/PoolManager.cpp @@ -0,0 +1,105 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include "IMemoryPool.hpp" +#include "PoolManager.hpp" + +#include "boost/assert.hpp" +#include "boost/polymorphic_cast.hpp" + +#include + +namespace armnn +{ + +PoolManager::PoolManager() + : m_FreePools() + , m_OccupiedPools() + , m_Semaphore() + , m_Mutex() +{} + +arm_compute::IMemoryPool *PoolManager::lock_pool() +{ + BOOST_ASSERT_MSG(!(m_FreePools.empty() && m_OccupiedPools.empty()), "Haven't setup any pools"); + + m_Semaphore->wait(); + std::lock_guard lock(m_Mutex); + + BOOST_ASSERT_MSG(!m_FreePools.empty(), "Empty pool must exist as semaphore has been signalled"); + m_OccupiedPools.splice(std::begin(m_OccupiedPools), m_FreePools, std::begin(m_FreePools)); + + return m_OccupiedPools.front().get(); +} + +void PoolManager::unlock_pool(arm_compute::IMemoryPool *pool) +{ + BOOST_ASSERT_MSG(!(m_FreePools.empty() && m_OccupiedPools.empty()), "Haven't setup any pools!"); + + std::lock_guard lock(m_Mutex); + + auto it = std::find_if( + std::begin(m_OccupiedPools), + std::end(m_OccupiedPools), + [pool](const std::unique_ptr &poolIterator) + { + return poolIterator.get() == pool; + } + ); + + BOOST_ASSERT_MSG(it != std::end(m_OccupiedPools), "Pool to be unlocked couldn't be found"); + m_FreePools.splice(std::begin(m_FreePools), m_OccupiedPools, it); + m_Semaphore->signal(); +} + +void PoolManager::register_pool(std::unique_ptr pool) +{ + std::lock_guard lock(m_Mutex); + BOOST_ASSERT_MSG(m_OccupiedPools.empty(), "All pools should be free in order to register a new one"); + + // Set pool + m_FreePools.push_front(std::move(pool)); + + // Update semaphore + m_Semaphore = std::make_unique(m_FreePools.size()); +} + +size_t PoolManager::num_pools() const +{ + std::lock_guard lock(m_Mutex); + + return m_FreePools.size() + m_OccupiedPools.size(); +} + +void PoolManager::AllocatePools() +{ + std::lock_guard lock(m_Mutex); + + for (auto& pool : m_FreePools) + { + boost::polymorphic_downcast(pool.get())->AllocatePool(); + } + + for (auto& pool : m_OccupiedPools) + { + boost::polymorphic_downcast(pool.get())->AllocatePool(); + } +} + +void PoolManager::ReleasePools() +{ + std::lock_guard lock(m_Mutex); + + for (auto& pool : m_FreePools) + { + boost::polymorphic_downcast(pool.get())->ReleasePool(); + } + + for (auto& pool : m_OccupiedPools) + { + boost::polymorphic_downcast(pool.get())->ReleasePool(); + } +} + +} //namespace armnn \ No newline at end of file diff --git a/src/armnn/memory/PoolManager.hpp b/src/armnn/memory/PoolManager.hpp new file mode 100644 index 0000000000..a8a51497aa --- /dev/null +++ b/src/armnn/memory/PoolManager.hpp @@ -0,0 +1,56 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "IPoolManager.hpp" + +#include "arm_compute/runtime/IMemoryPool.h" +#include "arm_compute/core/Error.h" +#include "support/Mutex.h" +#include "support/Semaphore.h" + +#include +#include +#include + +namespace armnn +{ + +class PoolManager : public IPoolManager +{ +public: + PoolManager(); + + PoolManager(const PoolManager &) = delete; + + PoolManager &operator=(const PoolManager &) = delete; + + PoolManager(PoolManager &&) = default; + + PoolManager &operator=(PoolManager &&) = default; + + arm_compute::IMemoryPool *lock_pool() override; + void unlock_pool(arm_compute::IMemoryPool *pool) override; + void register_pool(std::unique_ptr pool) override; + size_t num_pools() const override; + + void AllocatePools() override; + void ReleasePools() override; + +private: + /// List of free pools + std::list> m_FreePools; + + /// List of occupied pools + std::list> m_OccupiedPools; + + /// Semaphore to control the queues + std::unique_ptr m_Semaphore; + + /// Mutex to control access to the queues + mutable arm_compute::Mutex m_Mutex; +}; + +} // namespace armnn \ No newline at end of file diff --git a/src/armnn/optimizations/All.hpp b/src/armnn/optimizations/All.hpp index 70f78d44af..0603d44d31 100644 --- a/src/armnn/optimizations/All.hpp +++ b/src/armnn/optimizations/All.hpp @@ -4,8 +4,11 @@ // #pragma once +#include "ConvertConstants.hpp" #include "OptimizeInversePermutes.hpp" #include "PermuteAsReshape.hpp" #include "OptimizeConsecutiveReshapes.hpp" #include "SquashEqualSiblings.hpp" #include "MovePermuteUp.hpp" +#include "OptimizeInverseConversions.hpp" +#include "ConvertFp32NetworkToFp16.hpp" diff --git a/src/armnn/optimizations/ConvertConstants.hpp b/src/armnn/optimizations/ConvertConstants.hpp new file mode 100644 index 0000000000..d2dd650665 --- /dev/null +++ b/src/armnn/optimizations/ConvertConstants.hpp @@ -0,0 +1,98 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "Optimization.hpp" +#include "backends/CpuTensorHandle.hpp" +#include "Half.hpp" +#include "FloatingPointConverter.hpp" + +namespace armnn +{ +namespace optimizations +{ + +struct Float16ToFloat32 +{ + static void Func(std::unique_ptr& handle) + { + const TensorInfo& info = handle->GetTensorInfo(); + + if (info.GetDataType() == DataType::Float16) + { + std::vector newValues(info.GetNumElements()); + + armnnUtils::FloatingPointConverter::ConvertFloat16To32(handle->GetTensor(), + info.GetNumElements(), + newValues.data()); + + TensorInfo newInfo(info.GetShape(), DataType::Float32); + ConstTensor newInput(newInfo, newValues); + handle.reset(new ScopedCpuTensorHandle(newInput)); + } + } +}; + +struct Float32ToFloat16 +{ + static void Func(std::unique_ptr& handle) + { + const TensorInfo& info = handle->GetTensorInfo(); + + if (info.GetDataType() == DataType::Float32) + { + std::vector newValues(info.GetNumElements()); + + armnnUtils::FloatingPointConverter::ConvertFloat32To16(handle->GetTensor(), + info.GetNumElements(), + newValues.data()); + + TensorInfo newInfo(info.GetShape(), DataType::Float16); + ConstTensor newInput(newInfo, newValues); + handle.reset(new ScopedCpuTensorHandle(newInput)); + } + } +}; + +template +class ConvertConstants : public Optimization +{ +public: + ConvertConstants() = default; + ConvertConstants(const ConvertConstants&) = default; + virtual ~ConvertConstants() = default; + + void Run(Graph& graph, Layer& layer) const override + { + if (Predicate::Test(layer)) + { + layer.OperateOnConstantTensors(Converter::Func); + } + } +protected: +}; + +struct IsFloat32Layer +{ + static bool Test(const Layer& layer) + { + return layer.GetDataType() == DataType::Float32; + } +}; + +struct IsFloat16Layer +{ + static bool Test(const Layer& layer) + { + return layer.GetDataType() == DataType::Float16; + } +}; + +using ConvertConstantsHalfToFloat = ConvertConstants; +using ConvertConstantsFloatToHalf = ConvertConstants; + +} //namespace optimizations +} //namespace armnn diff --git a/src/armnn/optimizations/ConvertFp32NetworkToFp16.hpp b/src/armnn/optimizations/ConvertFp32NetworkToFp16.hpp new file mode 100644 index 0000000000..a4df05c18a --- /dev/null +++ b/src/armnn/optimizations/ConvertFp32NetworkToFp16.hpp @@ -0,0 +1,80 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "Optimization.hpp" +#include "NetworkUtils.hpp" + +namespace armnn +{ +namespace optimizations +{ + +class ConvertFp32NetworkToFp16Impl +{ +public: + + void Run(Graph& graph, Layer& layer) const + { + if(layer.GetType() == LayerType::Input) + { + // if the outputs of this layer are DataType::Float32 + // add a ConvertFloat32ToFloat16 layer after each of the outputs + if (layer.GetDataType() == DataType::Float32) + { + InsertConvertFp32ToFp16LayersAfter(graph, layer); + } + } + else if (layer.GetType() == LayerType::Output) + { + // if the inputs of this layer are DataType::Float32 + // add a ConvertFloat16ToFloat32 layer before each of the inputs + if (layer.GetDataType() == DataType::Float32) + { + InsertConvertFp16ToFp32LayersBefore(graph, layer); + } + } + else if (layer.GetType() != LayerType::ConvertFp32ToFp16 && layer.GetType() != LayerType::ConvertFp16ToFp32) + { + // if the inputs/outputs of this layer are DataType::Float32 + // change the data type for all inputs and outputs to DataType::Float16 + for (auto&& input = layer.BeginInputSlots(); input != layer.EndInputSlots(); ++input) + { + // if it is connected to OutputSlot of the InputLayer do not change the DataType of connection + // InputSlots of the current layer will be updated when conversion layer is inserted after InputLayer + Layer& base = input->GetConnectedOutputSlot()->GetOwningLayer(); + if (base.GetType() != LayerType::Input) + { + TensorInfo convertInfo = input->GetConnection()->GetTensorInfo(); + if (convertInfo.GetDataType() == DataType::Float32) + { + convertInfo.SetDataType(DataType::Float16); + input->GetConnection()->SetTensorInfo(convertInfo); + } + } + } + + // change outputs to DataType::Float16 + for (auto&& output = layer.BeginOutputSlots(); output != layer.EndOutputSlots(); ++output) + { + TensorInfo convertInfo = output->GetTensorInfo(); + if (convertInfo.GetDataType() == DataType::Float32) + { + convertInfo.SetDataType(DataType::Float16); + output->SetTensorInfo(convertInfo); + } + } + } + } + +protected: + ConvertFp32NetworkToFp16Impl() = default; + ~ConvertFp32NetworkToFp16Impl() = default; +}; + +using Fp32NetworkToFp16Converter = OptimizeForType; + +} // namespace optimizations +} // namespace armnn diff --git a/src/armnn/optimizations/MovePermuteUp.hpp b/src/armnn/optimizations/MovePermuteUp.hpp index 8c59986762..a8e18f5add 100644 --- a/src/armnn/optimizations/MovePermuteUp.hpp +++ b/src/armnn/optimizations/MovePermuteUp.hpp @@ -31,24 +31,24 @@ public: auto permute = boost::polymorphic_downcast(&connection.GetOwningLayer()); const PermutationVector& perm = permute->GetPermutation(); - // Insert an equivalent permute before every input of the base layer. + // Inserts an equivalent permute before every input of the base layer. for (auto baseInput = base.BeginInputSlots(); baseInput != base.EndInputSlots(); ++baseInput) { - // Insert new permute layer. + // Inserts a new permute layer. const std::string name = std::string("moved_up-") + permute->GetName(); PermuteLayer& permLayer = *graph.InsertNewLayer(*baseInput, perm, name.c_str()); - // Set output tensor info for the new layer. + // Sets output tensor info for the new layer. OutputSlot& parentOutput = *permLayer.GetInputSlot(0).GetConnectedOutputSlot(); const TensorInfo permOutInfo = armnnUtils::Permuted(parentOutput.GetTensorInfo(), perm); permLayer.GetOutputHandler().SetTensorInfo(permOutInfo); } - // Set permuted output tensor info + // Sets permuted output tensor info const TensorInfo& childOutInfo = permute->GetOutputHandler().GetTensorInfo(); base.GetOutputHandler().SetTensorInfo(childOutInfo); - // Bypass permute. It will be removed as it's left unconnected. + // Bypasses permute. It will be removed as it's left unconnected. permute->GetOutputSlot().MoveAllConnections(base.GetOutputSlot()); } } diff --git a/src/armnn/optimizations/Optimization.hpp b/src/armnn/optimizations/Optimization.hpp index f81071891b..ee4f91d842 100644 --- a/src/armnn/optimizations/Optimization.hpp +++ b/src/armnn/optimizations/Optimization.hpp @@ -13,9 +13,10 @@ namespace armnn class Optimization { public: + Optimization() = default; + virtual ~Optimization() = default; virtual void Run(Graph& graph, Layer& base) const = 0; protected: - ~Optimization() = default; }; // Wrappers @@ -44,7 +45,7 @@ protected: ~OptimizeForTypeImpl() = default; }; -/// Specialization that calls Wrapped::Run() for any layer type +/// Specialization that calls Wrapped::Run() for any layer type. template class OptimizeForTypeImpl : public armnn::Optimization, public Wrapped { @@ -90,7 +91,7 @@ public: } } - // Remove unconnected children + // Removes unconnected children. for (unsigned int i = 0; i < output->GetNumConnections();) { Layer* child = &output->GetConnection(i)->GetOwningLayer(); diff --git a/src/armnn/optimizations/OptimizeConsecutiveReshapes.hpp b/src/armnn/optimizations/OptimizeConsecutiveReshapes.hpp index 9a926a57a4..935186d32e 100644 --- a/src/armnn/optimizations/OptimizeConsecutiveReshapes.hpp +++ b/src/armnn/optimizations/OptimizeConsecutiveReshapes.hpp @@ -31,19 +31,19 @@ public: if (inInfo.GetShape() != outInfo.GetShape()) { - // Insert equivalent reshape before base layer + // Inserts equivalent reshape before base layer. const std::string name = std::string("merged-") + base.GetName() + std::string("-with-") + child.GetName(); const ReshapeDescriptor descriptor{outInfo.GetShape()}; auto& newReshape = *graph.InsertNewLayer(base.GetInputSlot(0), descriptor, name.c_str()); - // Set tensor info for new layer + // Sets tensor info for new layer. newReshape.GetOutputHandler().SetTensorInfo(outInfo); - // Reconnect base with original parent + // Reconnects base with original parent. newReshape.GetOutputSlot().MoveAllConnections(*parentOut); - // Parent is now the new layer + // Parent is now the new layer. parentOut = &newReshape.GetOutputSlot(); } - // Move connections in child output to parent layer. + // Moves connections in child output to parent layer. // Child layer will be removed as it's left unconnected. // Base layer will be removed if left unconnected. child.GetOutputSlot().MoveAllConnections(*parentOut); diff --git a/src/armnn/optimizations/OptimizeInverseConversions.hpp b/src/armnn/optimizations/OptimizeInverseConversions.hpp new file mode 100644 index 0000000000..5089d63f2f --- /dev/null +++ b/src/armnn/optimizations/OptimizeInverseConversions.hpp @@ -0,0 +1,44 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "Optimization.hpp" + +namespace armnn +{ +namespace optimizations +{ + +class OptimizeInverseConversionsImpl +{ +public: + /// Run for every connection between two inverse data type conversion layers, i.e. + /// Fp16ToFp32 followed by Fp32ToFp16 or vice-versa. + void Run(Graph& graph, InputSlot& connection) const + { + Layer& base = connection.GetConnectedOutputSlot()->GetOwningLayer(); + Layer& child = connection.GetOwningLayer(); + + BOOST_ASSERT((base.GetType() == LayerType::ConvertFp16ToFp32 && + child.GetType() == LayerType::ConvertFp32ToFp16) || + (base.GetType() == LayerType::ConvertFp32ToFp16 && + child.GetType() == LayerType::ConvertFp16ToFp32)); + + // Bypass both conversion layers + child.GetOutputSlot().MoveAllConnections(*base.GetInputSlot(0).GetConnectedOutputSlot()); + } + +protected: + OptimizeInverseConversionsImpl() = default; + ~OptimizeInverseConversionsImpl() = default; +}; + +using OptimizeInverseConversionsFp16 = + OptimizeForConnection; +using OptimizeInverseConversionsFp32 = + OptimizeForConnection; + +} // namespace optimizations +} // namespace armnn diff --git a/src/armnn/optimizations/PermuteAsReshape.hpp b/src/armnn/optimizations/PermuteAsReshape.hpp index a8e4c2df5e..736cd5dc98 100644 --- a/src/armnn/optimizations/PermuteAsReshape.hpp +++ b/src/armnn/optimizations/PermuteAsReshape.hpp @@ -23,7 +23,7 @@ public: const std::string name = std::string("as_reshape-") + permute.GetName(); const ReshapeDescriptor descriptor{outInfo.GetShape()}; - // Insert so layers don't need to be re-sorted + // Inserts NewLayer so layers don't need to be re-sorted. auto reshape = graph.InsertNewLayer(permute.GetInputSlot(0), descriptor, name.c_str()); reshape->GetOutputHandler().SetTensorInfo(outInfo); diff --git a/src/armnn/optimizations/SquashEqualSiblings.hpp b/src/armnn/optimizations/SquashEqualSiblings.hpp index c5ce28e723..6e0fa78e4e 100644 --- a/src/armnn/optimizations/SquashEqualSiblings.hpp +++ b/src/armnn/optimizations/SquashEqualSiblings.hpp @@ -41,7 +41,7 @@ public: { std::swap(sibling, lowestPriorityChild); } - // Bypass sibling. It will be removed as it's left unconnected. + // Bypasses sibling. It will be removed as it's left unconnected. auto siblingOut = sibling->BeginOutputSlots(); for (auto lowestPriorityChildOut = lowestPriorityChild->BeginOutputSlots(); lowestPriorityChildOut != lowestPriorityChild->EndOutputSlots(); ++lowestPriorityChildOut) diff --git a/src/armnn/test/CreateWorkload.hpp b/src/armnn/test/CreateWorkload.hpp index c3f4b8a1bf..ee0c584b13 100644 --- a/src/armnn/test/CreateWorkload.hpp +++ b/src/armnn/test/CreateWorkload.hpp @@ -22,7 +22,7 @@ namespace using namespace std; -// Calls CreateWorkload for a layer, and checks the returned pointer is of the correct type +// Calls CreateWorkload for a layer, and checks the returned pointer is of the correct type. template std::unique_ptr MakeAndCheckWorkload(Layer& layer, Graph& graph, const IWorkloadFactory& factory) { @@ -30,18 +30,19 @@ std::unique_ptr MakeAndCheckWorkload(Layer& layer, Graph& graph, const BOOST_TEST(workload.get() == boost::polymorphic_downcast(workload.get()), "Cannot convert to derived class"); std::string reasonIfUnsupported; + layer.SetComputeDevice(factory.GetCompute()); BOOST_TEST(factory.IsLayerSupported(layer, layer.GetDataType(), reasonIfUnsupported)); return std::unique_ptr(static_cast(workload.release())); } -// connects two layers +// Connects two layers. void Connect(Layer* from, Layer* to, const TensorInfo& tensorInfo, unsigned int fromIndex = 0, unsigned int toIndex = 0) { from->GetOutputSlot(fromIndex).Connect(to->GetInputSlot(toIndex)); from->GetOutputHandler(fromIndex).SetTensorInfo(tensorInfo); } -// helper function to create tensor handlers for workloads, assuming they all use the same factory +// Helper function to create tensor handlers for workloads, assuming they all use the same factory. void CreateTensorHandles(armnn::Graph& graph, armnn::IWorkloadFactory& factory) { for (auto&& layer : graph.TopologicalSort()) @@ -57,11 +58,11 @@ void CreateTensorHandles(armnn::Graph& graph, armnn::IWorkloadFactory& factory) // They return the created workloads so that backend-specific checks can be performed. ///////////////////////////////////////////////////////////////////////////////////////////// -template +template std::unique_ptr CreateActivationWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. ActivationDescriptor layerDesc; layerDesc.m_Function = ActivationFunction::Abs; layerDesc.m_A = 3.5f; @@ -69,19 +70,19 @@ std::unique_ptr CreateActivationWorkloadTest(armnn::IWorkloa ActivationLayer* const layer = graph.AddLayer(layerDesc, "layer"); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer(0, "input"); Layer* const output = graph.AddLayer(0, "output"); - // connect up - armnn::TensorInfo tensorInfo({1, 1}, ActivationWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo tensorInfo({1, 1}, DataType); Connect(input, layer, tensorInfo); Connect(layer, output, tensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload(*layer, graph, factory); ActivationQueueDescriptor queueDescriptor = workload->GetData(); @@ -91,51 +92,51 @@ std::unique_ptr CreateActivationWorkloadTest(armnn::IWorkloa BOOST_TEST(queueDescriptor.m_Parameters.m_B == -10.0f); BOOST_TEST((queueDescriptor.m_Parameters.m_Function == ActivationFunction::Abs)); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template +template std::unique_ptr CreateAdditionWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. Layer* const layer = graph.AddLayer("layer"); - // create extra layers + // Creates extra layers. Layer* const input1 = graph.AddLayer(1, "input1"); Layer* const input2 = graph.AddLayer(2, "input2"); Layer* const output = graph.AddLayer(0, "output"); - // connect up - armnn::TensorInfo tensorInfo({2, 3}, AdditionWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo tensorInfo({2, 3}, DataType); Connect(input1, layer, tensorInfo, 0, 0); Connect(input2, layer, tensorInfo, 0, 1); Connect(layer, output, tensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload(*layer, graph, factory); AdditionQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Inputs.size() == 2); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template +template std::unique_ptr CreateBatchNormalizationWorkloadTest( armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. BatchNormalizationDescriptor layerDesc; layerDesc.m_Eps = 0.05f; BatchNormalizationLayer* const layer = graph.AddLayer(layerDesc, "layer"); - armnn::TensorInfo weightInfo({3}, armnn::DataType::Float32); + armnn::TensorInfo weightInfo({3}, DataType); layer->m_Mean = std::make_unique(weightInfo); layer->m_Variance = std::make_unique(weightInfo); layer->m_Beta = std::make_unique(weightInfo); @@ -145,37 +146,37 @@ std::unique_ptr CreateBatchNormalizationWorkl layer->m_Beta->Allocate(); layer->m_Gamma->Allocate(); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer(0, "input"); Layer* const output = graph.AddLayer(0, "output"); - // connect up - armnn::TensorInfo tensorInfo({2, 3, 1, 1}, armnn::DataType::Float32); + // Connects up. + armnn::TensorInfo tensorInfo({2, 3, 1, 1}, DataType); Connect(input, layer, tensorInfo); Connect(layer, output, tensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload(*layer, graph, factory); BatchNormalizationQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Parameters.m_Eps == 0.05f); BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - BOOST_TEST((queueDescriptor.m_Mean->GetTensorInfo() == TensorInfo({3}, DataType::Float32))); - BOOST_TEST((queueDescriptor.m_Variance->GetTensorInfo() == TensorInfo({3}, DataType::Float32))); - BOOST_TEST((queueDescriptor.m_Gamma->GetTensorInfo() == TensorInfo({3}, DataType::Float32))); - BOOST_TEST((queueDescriptor.m_Beta->GetTensorInfo() == TensorInfo({3}, DataType::Float32))); + BOOST_TEST((queueDescriptor.m_Mean->GetTensorInfo() == TensorInfo({3}, DataType))); + BOOST_TEST((queueDescriptor.m_Variance->GetTensorInfo() == TensorInfo({3}, DataType))); + BOOST_TEST((queueDescriptor.m_Gamma->GetTensorInfo() == TensorInfo({3}, DataType))); + BOOST_TEST((queueDescriptor.m_Beta->GetTensorInfo() == TensorInfo({3}, DataType))); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template +template std::unique_ptr CreateConvolution2dWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. Convolution2dDescriptor layerDesc; layerDesc.m_PadLeft = 3; layerDesc.m_PadRight = 3; @@ -187,24 +188,22 @@ std::unique_ptr CreateConvolution2dWorkloadTest(armnn::IW Convolution2dLayer* const layer = graph.AddLayer(layerDesc, "layer"); - layer->m_Weight = std::make_unique(TensorInfo({2, 3, 5, 3}, - Convolution2dWorkload::ms_DataType)); - layer->m_Bias = std::make_unique - (TensorInfo({2}, GetBiasDataType(Convolution2dWorkload::ms_DataType))); + layer->m_Weight = std::make_unique(TensorInfo({2, 3, 5, 3}, DataType)); + layer->m_Bias = std::make_unique(TensorInfo({2}, GetBiasDataType(DataType))); layer->m_Weight->Allocate(); layer->m_Bias->Allocate(); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer(0, "input"); Layer* const output = graph.AddLayer(0, "output"); - // connect up - Connect(input, layer, TensorInfo({2, 3, 8, 16}, Convolution2dWorkload::ms_DataType)); - Connect(layer, output, TensorInfo({2, 2, 2, 10}, Convolution2dWorkload::ms_DataType)); + // Connecst up. + Connect(input, layer, TensorInfo({2, 3, 8, 16}, DataType)); + Connect(layer, output, TensorInfo({2, 2, 2, 10}, DataType)); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload(*layer, graph, factory); Convolution2dQueueDescriptor queueDescriptor = workload->GetData(); @@ -218,20 +217,123 @@ std::unique_ptr CreateConvolution2dWorkloadTest(armnn::IW BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({2, 3, 5, 3}, - Convolution2dWorkload::ms_DataType))); + BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({2, 3, 5, 3}, DataType))); BOOST_TEST((queueDescriptor.m_Bias->GetTensorInfo() == - TensorInfo({2}, GetBiasDataType(Convolution2dWorkload::ms_DataType)))); + TensorInfo({2}, GetBiasDataType(DataType)))); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template +template +std::unique_ptr CreateLstmWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) +{ + // This parameter setting is for withCifgWithPeepholeNoProjection + LstmDescriptor layerDesc; + layerDesc.m_ActivationFunc = 4; + layerDesc.m_ClippingThresCell = 0.0f; + layerDesc.m_ClippingThresProj = 0.0f; + layerDesc.m_CifgEnabled = true; + layerDesc.m_PeepholeEnabled = true; + layerDesc.m_ProjectionEnabled = false; + + LstmLayer* const layer = graph.AddLayer(layerDesc, "layer"); + unsigned int batchSize = 2; + unsigned int inputSize = 2; + unsigned int numUnits = 4; + unsigned int outputSize = 4; + + layer->m_BasicParameters.m_InputToForgetWeights = std::make_unique + (TensorInfo({ numUnits, inputSize }, DataType::Float32)); + layer->m_BasicParameters.m_InputToCellWeights = std::make_unique + (TensorInfo({ numUnits, inputSize }, DataType::Float32)); + layer->m_BasicParameters.m_InputToOutputWeights = std::make_unique + (TensorInfo({ numUnits, inputSize }, DataType::Float32)); + layer->m_BasicParameters.m_RecurrentToForgetWeights = std::make_unique + (TensorInfo({ numUnits, outputSize }, DataType::Float32)); + layer->m_BasicParameters.m_RecurrentToCellWeights = std::make_unique + (TensorInfo({ numUnits, outputSize }, DataType::Float32)); + layer->m_BasicParameters.m_RecurrentToOutputWeights = std::make_unique + (TensorInfo({ numUnits, outputSize }, DataType::Float32)); + layer->m_BasicParameters.m_ForgetGateBias = std::make_unique + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_BasicParameters.m_CellBias = std::make_unique + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_BasicParameters.m_OutputGateBias = std::make_unique + (TensorInfo({ numUnits }, DataType::Float32)); + + layer->m_BasicParameters.m_InputToForgetWeights->Allocate(); + layer->m_BasicParameters.m_InputToCellWeights->Allocate(); + layer->m_BasicParameters.m_InputToOutputWeights->Allocate(); + layer->m_BasicParameters.m_RecurrentToForgetWeights->Allocate(); + layer->m_BasicParameters.m_RecurrentToCellWeights->Allocate(); + layer->m_BasicParameters.m_RecurrentToOutputWeights->Allocate(); + layer->m_BasicParameters.m_ForgetGateBias->Allocate(); + layer->m_BasicParameters.m_CellBias->Allocate(); + layer->m_BasicParameters.m_OutputGateBias->Allocate(); + + + if (layerDesc.m_PeepholeEnabled) + { + layer->m_PeepholeParameters.m_CellToForgetWeights = std::make_unique + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_PeepholeParameters.m_CellToOutputWeights = std::make_unique + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_PeepholeParameters.m_CellToForgetWeights->Allocate(); + layer->m_PeepholeParameters.m_CellToOutputWeights->Allocate(); + } + + // create input and output layers + Layer* const input = graph.AddLayer(0, "input"); + Layer* const outputStateIn = graph.AddLayer(1, "outputStateIn"); + Layer* const cellStateIn = graph.AddLayer(2, "cellStateIn"); + Layer* const scratchBuffer = graph.AddLayer(0, "scratchBuffer"); + Layer* const outputStateOut = graph.AddLayer(1, "outputStateOut"); + Layer* const cellStateOut = graph.AddLayer(2, "cellStateOut"); + Layer* const output = graph.AddLayer(3, "output"); + + // connect up + armnn::TensorInfo lstmTensorInfo1({ batchSize, inputSize }, DataType::Float32); + armnn::TensorInfo lstmTensorInfo2({ batchSize, numUnits}, DataType::Float32); + armnn::TensorInfo lstmTensorInfo3({ batchSize, outputSize }, DataType::Float32); + armnn::TensorInfo lstmTensorInfoScratchBuff({ batchSize, numUnits*3 }, DataType::Float32); + if (layerDesc.m_CifgEnabled) + { + lstmTensorInfoScratchBuff.SetShape({ batchSize, numUnits*4 }); + } + + Connect(input, layer, lstmTensorInfo1, 0, 0); + Connect(cellStateIn, layer, lstmTensorInfo2, 0, 1); + Connect(outputStateIn, layer, lstmTensorInfo3, 0, 2); + Connect(layer, scratchBuffer, lstmTensorInfoScratchBuff, 0, 0); + Connect(layer, outputStateOut, lstmTensorInfo3, 1, 0); + Connect(layer, cellStateOut, lstmTensorInfo2, 2, 0); + Connect(layer, output, lstmTensorInfo3, 3, 0); + + CreateTensorHandles(graph, factory); + + // make the workload and check it + auto workload = MakeAndCheckWorkload(*layer, graph, factory); + LstmQueueDescriptor queueDescriptor = workload->GetData(); + BOOST_TEST(queueDescriptor.m_Parameters.m_ActivationFunc == 4); + BOOST_TEST(queueDescriptor.m_Parameters.m_ClippingThresCell == 0.0f); + BOOST_TEST(queueDescriptor.m_Parameters.m_ClippingThresProj == 0.0f); + BOOST_TEST(queueDescriptor.m_Inputs.size() == 3); + BOOST_TEST(queueDescriptor.m_Outputs.size() == 4); + + BOOST_TEST((queueDescriptor.m_InputToForgetWeights->GetTensorInfo() == TensorInfo({ numUnits, inputSize }, + DataType::Float32))); + BOOST_TEST((queueDescriptor.m_OutputGateBias->GetTensorInfo() == TensorInfo({ numUnits }, + DataType::Float32))); + BOOST_TEST((queueDescriptor.m_CellBias->GetTensorInfo() == TensorInfo({ numUnits }, DataType::Float32))); + return workload; +} + +template std::unique_ptr CreateDirectConvolution2dWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. Convolution2dDescriptor layerDesc; layerDesc.m_PadLeft = 1; layerDesc.m_PadRight = 1; @@ -243,26 +345,25 @@ std::unique_ptr CreateDirectConvolution2dWorkloadTest(arm Convolution2dLayer* const layer = graph.AddLayer(layerDesc, "layer"); - float inputsQScale = Convolution2dWorkload::ms_DataType == DataType::QuantisedAsymm8 ? 1.0f : 0.0; - float outputQScale = Convolution2dWorkload::ms_DataType == DataType::QuantisedAsymm8 ? 2.0f : 0.0; + float inputsQScale = DataType == armnn::DataType::QuantisedAsymm8 ? 1.0f : 0.0; + float outputQScale = DataType == armnn::DataType::QuantisedAsymm8 ? 2.0f : 0.0; - layer->m_Weight = std::make_unique(TensorInfo({ 2, 3, 3, 3 }, - Convolution2dWorkload::ms_DataType, inputsQScale)); + layer->m_Weight = std::make_unique(TensorInfo({ 2, 3, 3, 3 }, DataType, inputsQScale)); layer->m_Bias = std::make_unique - (TensorInfo({2}, GetBiasDataType(Convolution2dWorkload::ms_DataType), inputsQScale)); + (TensorInfo({2}, GetBiasDataType(DataType), inputsQScale)); layer->m_Weight->Allocate(); layer->m_Bias->Allocate(); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer(0, "input"); Layer* const output = graph.AddLayer(0, "output"); - // connect up - Connect(input, layer, TensorInfo({2, 3, 6, 6}, Convolution2dWorkload::ms_DataType, inputsQScale)); - Connect(layer, output, TensorInfo({2, 2, 6, 6}, Convolution2dWorkload::ms_DataType, outputQScale)); + // Connects up. + Connect(input, layer, TensorInfo({2, 3, 6, 6}, DataType, inputsQScale)); + Connect(layer, output, TensorInfo({2, 2, 6, 6}, DataType, outputQScale)); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload(*layer, graph, factory); Convolution2dQueueDescriptor queueDescriptor = workload->GetData(); @@ -277,11 +378,11 @@ std::unique_ptr CreateDirectConvolution2dWorkloadTest(arm BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({2, 3, 3, 3}, - Convolution2dWorkload::ms_DataType, inputsQScale))); + DataType, inputsQScale))); BOOST_TEST((queueDescriptor.m_Bias->GetTensorInfo() - == TensorInfo({2}, GetBiasDataType(Convolution2dWorkload::ms_DataType), inputsQScale))); + == TensorInfo({2}, GetBiasDataType(DataType), inputsQScale))); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } @@ -289,7 +390,7 @@ template std::unique_ptr CreateDepthwiseConvolution2dWorkloadTest( armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. DepthwiseConvolution2dDescriptor layerDesc; layerDesc.m_PadLeft = 3; layerDesc.m_PadRight = 3; @@ -306,16 +407,16 @@ std::unique_ptr CreateDepthwiseConvolutio layer->m_Weight->Allocate(); layer->m_Bias->Allocate(); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer(0, "input"); Layer* const output = graph.AddLayer(0, "output"); - // connect up + // Connects up. Connect(input, layer, TensorInfo({2, 3, 8, 16}, armnn::DataType::Float32)); Connect(layer, output, TensorInfo({2, 9, 2, 10}, armnn::DataType::Float32)); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload(*layer, graph, factory); DepthwiseConvolution2dQueueDescriptor queueDescriptor = workload->GetData(); @@ -332,41 +433,39 @@ std::unique_ptr CreateDepthwiseConvolutio BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({3, 3, 5, 3}, DataType::Float32))); BOOST_TEST((queueDescriptor.m_Bias->GetTensorInfo() == TensorInfo({9}, DataType::Float32))); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template +template std::unique_ptr CreateFullyConnectedWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. FullyConnectedDescriptor layerDesc; layerDesc.m_BiasEnabled = true; layerDesc.m_TransposeWeightMatrix = true; FullyConnectedLayer* const layer = graph.AddLayer(layerDesc, "layer"); - float inputsQScale = FullyConnectedWorkload::ms_DataType == DataType::QuantisedAsymm8 ? 1.0f : 0.0; - float outputQScale = FullyConnectedWorkload::ms_DataType == DataType::QuantisedAsymm8 ? 2.0f : 0.0; + float inputsQScale = DataType == armnn::DataType::QuantisedAsymm8 ? 1.0f : 0.0; + float outputQScale = DataType == armnn::DataType::QuantisedAsymm8 ? 2.0f : 0.0; - layer->m_Weight = std::make_unique(TensorInfo({7, 20}, - FullyConnectedWorkload::ms_DataType, inputsQScale, 0)); - layer->m_Bias = std::make_unique(TensorInfo({7}, - GetBiasDataType(FullyConnectedWorkload::ms_DataType), inputsQScale)); + layer->m_Weight = std::make_unique(TensorInfo({7, 20}, DataType, inputsQScale, 0)); + layer->m_Bias = std::make_unique(TensorInfo({7}, GetBiasDataType(DataType), inputsQScale)); layer->m_Weight->Allocate(); layer->m_Bias->Allocate(); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer(0, "input"); Layer* const output = graph.AddLayer(0, "output"); - // connect up - Connect(input, layer, TensorInfo({3, 1, 4, 5}, FullyConnectedWorkload::ms_DataType, inputsQScale)); - Connect(layer, output, TensorInfo({3, 7}, FullyConnectedWorkload::ms_DataType, outputQScale)); + // Connects up. + Connect(input, layer, TensorInfo({3, 1, 4, 5}, DataType, inputsQScale)); + Connect(layer, output, TensorInfo({3, 7}, DataType, outputQScale)); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload(*layer, graph, factory); FullyConnectedQueueDescriptor queueDescriptor = workload->GetData(); @@ -375,50 +474,48 @@ std::unique_ptr CreateFullyConnectedWorkloadTest(armnn:: BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == - TensorInfo({7, 20}, FullyConnectedWorkload::ms_DataType, inputsQScale))); - BOOST_TEST((queueDescriptor.m_Bias->GetTensorInfo() == - TensorInfo({7}, GetBiasDataType(FullyConnectedWorkload::ms_DataType), inputsQScale))); + BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({7, 20}, DataType, inputsQScale))); + BOOST_TEST((queueDescriptor.m_Bias->GetTensorInfo() == TensorInfo({7}, GetBiasDataType(DataType), inputsQScale))); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template +template std::unique_ptr CreateMultiplicationWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. Layer* const layer = graph.AddLayer("layer"); - // create extra layers + // Creates extra layers. Layer* const input1 = graph.AddLayer(1, "input1"); Layer* const input2 = graph.AddLayer(2, "input2"); Layer* const output = graph.AddLayer(0, "output"); - // connect up - armnn::TensorInfo tensorInfo({2, 3}, MultiplicationWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo tensorInfo({2, 3}, DataType); Connect(input1, layer, tensorInfo, 0, 0); Connect(input2, layer, tensorInfo, 0, 1); Connect(layer, output, tensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload(*layer, graph, factory); MultiplicationQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Inputs.size() == 2); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template +template std::unique_ptr CreateNormalizationWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. NormalizationDescriptor layerDesc; layerDesc.m_NormChannelType = NormalizationAlgorithmChannel::Across; layerDesc.m_NormMethodType = NormalizationAlgorithmMethod::LocalBrightness; @@ -429,16 +526,16 @@ std::unique_ptr CreateNormalizationWorkloadTest(ar NormalizationLayer* layer = graph.AddLayer(layerDesc, "layer"); - // create extra layers + // Creatse extra layers. Layer* const input = graph.AddLayer(0, "input"); Layer* const output = graph.AddLayer(0, "output"); - // connect up - Connect(input, layer, TensorInfo({3, 5, 5, 1}, armnn::DataType::Float32)); - Connect(layer, output, TensorInfo({3, 5, 5, 1}, armnn::DataType::Float32)); + // Connects up. + Connect(input, layer, TensorInfo({3, 5, 5, 1}, DataType)); + Connect(layer, output, TensorInfo({3, 5, 5, 1}, DataType)); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload(*layer, graph, factory); NormalizationQueueDescriptor queueDescriptor = workload->GetData(); @@ -452,15 +549,15 @@ std::unique_ptr CreateNormalizationWorkloadTest(ar BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template +template std::unique_ptr CreatePooling2dWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. Pooling2dDescriptor layerDesc; layerDesc.m_PoolType = PoolingAlgorithm::Average; layerDesc.m_PoolWidth = 3; @@ -475,16 +572,16 @@ std::unique_ptr CreatePooling2dWorkloadTest(armnn::IWorkloadF Pooling2dLayer* const layer = graph.AddLayer(layerDesc, "layer"); - // create extra layers + // Create extra layers Layer* const input = graph.AddLayer(0, "input"); Layer* const output = graph.AddLayer(0, "output"); - // connect up - Connect(input, layer, TensorInfo({3, 2, 5, 5}, Pooling2dWorkload::ms_DataType)); - Connect(layer, output, TensorInfo({3, 2, 2, 4}, Pooling2dWorkload::ms_DataType)); + // Connect up + Connect(input, layer, TensorInfo({3, 2, 5, 5}, DataType)); + Connect(layer, output, TensorInfo({3, 2, 2, 4}, DataType)); CreateTensorHandles(graph, factory); - // make the workload and check it + // Make the workload and checks it auto workload = MakeAndCheckWorkload(*layer, graph, factory); Pooling2dQueueDescriptor queueDescriptor = workload->GetData(); @@ -502,70 +599,70 @@ std::unique_ptr CreatePooling2dWorkloadTest(armnn::IWorkloadF BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Return so we can do extra, backend-specific tests return workload; } -template +template std::unique_ptr CreateSoftmaxWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Create the layer we're testing. SoftmaxDescriptor softmaxDescriptor; Layer* const layer = graph.AddLayer(softmaxDescriptor, "layer"); - // create extra layers + // Create extra layers. Layer* const input = graph.AddLayer(0, "input"); Layer* const output = graph.AddLayer(0, "output"); - // connect up - armnn::TensorInfo tensorInfo({4, 1}, SoftmaxWorkload::ms_DataType); + // Connect up + armnn::TensorInfo tensorInfo({4, 1}, DataType); Connect(input, layer, tensorInfo); Connect(layer, output, tensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Make the workload and checks it. auto workload = MakeAndCheckWorkload(*layer, graph, factory); SoftmaxQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Return so we can do extra, backend-specific tests. return workload; } -template +template std::unique_ptr CreateSplitterWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Create the layer we're testing. // NOTE: need three dimensions channels, height/y, width/x because the Compute // library restricts subtensors to have the same x and y dimensions as // their parent tensors, and therefore the origin on the x and y dimension // has to be zero for any view. So we need a third dimension to split... - // NOTE: arguments are: number of views, number of dimensions + // NOTE: arguments are: number of views, number of dimensions. ViewsDescriptor layerDesc(3, 3); - // NOTE: arguments are: view, dimension, value + // NOTE: arguments are: view, dimension, value. layerDesc.SetViewOriginCoord(0, 0, 0); layerDesc.SetViewOriginCoord(1, 0, 1); layerDesc.SetViewOriginCoord(2, 0, 3); Layer* const layer = graph.AddLayer(layerDesc, "layer"); - // add extra layers + // Adds extra layers. Layer* const input = graph.AddLayer(0, "input"); Layer* const output0 = graph.AddLayer(0, "output0"); Layer* const output1 = graph.AddLayer(1, "output1"); Layer* const output2 = graph.AddLayer(2, "output2"); - // connect up - armnn::TensorInfo tensorInfo({5, 7, 7}, SplitterWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo tensorInfo({5, 7, 7}, DataType); Connect(input, layer, tensorInfo); - armnn::TensorInfo output0Info({1, 7, 7}, SplitterWorkload::ms_DataType); - armnn::TensorInfo output1Info({2, 7, 7}, SplitterWorkload::ms_DataType); - armnn::TensorInfo output2Info({2, 7, 7}, SplitterWorkload::ms_DataType); + armnn::TensorInfo output0Info({1, 7, 7}, DataType); + armnn::TensorInfo output1Info({2, 7, 7}, DataType); + armnn::TensorInfo output2Info({2, 7, 7}, DataType); Connect(layer, output0, output0Info, 0, 0); Connect(layer, output1, output1Info, 1, 0); @@ -573,7 +670,7 @@ std::unique_ptr CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload(*layer, graph, factory); SplitterQueueDescriptor queueDescriptor = workload->GetData(); @@ -591,24 +688,21 @@ std::unique_ptr BOOST_TEST(queueDescriptor.m_ViewOrigins[1].m_Origin[2] == 0); BOOST_TEST(queueDescriptor.m_ViewOrigins[2].m_Origin[2] == 0); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -/// This function constructs a graph with both a splitter and a merger, and returns a pair of the workloads -template +/// This function constructs a graph with both a splitter and a merger, and returns a pair of the workloads. +template std::pair, std::unique_ptr> CreateSplitterMergerWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - static_assert(SplitterWorkload::ms_DataType == MergerWorkload::ms_DataType, - "Splitter and merger workloads must have the same data type"); + armnn::TensorInfo inputTensorInfo({ 1, 2, 100, 10 }, DataType); - armnn::TensorInfo inputTensorInfo({ 1, 2, 100, 10 }, SplitterWorkload::ms_DataType); + armnn::TensorInfo splitTensorInfo1({ 1, 1, 100, 10 }, DataType); + armnn::TensorInfo splitTensorInfo2({ 1, 1, 100, 10 }, DataType); - armnn::TensorInfo splitTensorInfo1({ 1, 1, 100, 10 }, SplitterWorkload::ms_DataType); - armnn::TensorInfo splitTensorInfo2({ 1, 1, 100, 10 }, SplitterWorkload::ms_DataType); - - //construct the graph + //Constructs the graph. Layer* const input = graph.AddLayer(0, "input"); armnn::ViewsDescriptor splitterViews(2); @@ -641,12 +735,12 @@ std::pair, std::unique_ptr> Layer* const output = graph.AddLayer(0, "output"); - // add connections + // Adds connections. Connect(input, splitter, inputTensorInfo, 0, 0); BOOST_TEST_CHECKPOINT("connect input to splitter"); - Connect(splitter, merger, splitTensorInfo1, 0, 1); // The splitter & merger are connected up + Connect(splitter, merger, splitTensorInfo1, 0, 1); // The splitter & merger are connected up. BOOST_TEST_CHECKPOINT("connect splitter[0] to merger[1]"); - Connect(splitter, merger, splitTensorInfo2, 1, 0); // so that the outputs are flipped round + Connect(splitter, merger, splitTensorInfo2, 1, 0); // So that the outputs are flipped round. BOOST_TEST_CHECKPOINT("connect splitter[1] to merger[0]"); Connect(merger, output, inputTensorInfo, 0, 0); BOOST_TEST_CHECKPOINT("connect merger to output"); @@ -665,7 +759,7 @@ std::pair, std::unique_ptr> /// This function constructs a graph with a splitter with two outputs. Each of the outputs is then /// connected to two different activation layers -template +template void CreateSplitterMultipleInputsOneOutputWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph, std::unique_ptr& wlSplitter, std::unique_ptr& wlActiv0_0, @@ -673,14 +767,11 @@ void CreateSplitterMultipleInputsOneOutputWorkloadTest(armnn::IWorkloadFactory& std::unique_ptr& wlActiv1_0, std::unique_ptr& wlActiv1_1) { - static_assert(SplitterWorkload::ms_DataType == ActivationWorkload::ms_DataType, - "Splitter and activation workloads must have the same data type"); - - armnn::TensorInfo inputTensorInfo ({ 1, 3, 100, 50 }, SplitterWorkload::ms_DataType); - armnn::TensorInfo splitTensorInfo1({ 1, 1, 100, 50 }, SplitterWorkload::ms_DataType); - armnn::TensorInfo splitTensorInfo2({ 1, 2, 100, 50 }, SplitterWorkload::ms_DataType); + armnn::TensorInfo inputTensorInfo ({ 1, 3, 100, 50 }, DataType); + armnn::TensorInfo splitTensorInfo1({ 1, 1, 100, 50 }, DataType); + armnn::TensorInfo splitTensorInfo2({ 1, 2, 100, 50 }, DataType); - //construct the graph + //Constructs the graph. Layer* const input = graph.AddLayer(0, "input"); armnn::ViewsDescriptor splitterViews(2); @@ -709,7 +800,7 @@ void CreateSplitterMultipleInputsOneOutputWorkloadTest(armnn::IWorkloadFactory& Layer* const output3 = graph.AddLayer(3, "output3"); Layer* const output4 = graph.AddLayer(4, "output4"); - // add connections + // Adds connections. Connect(input, splitter, inputTensorInfo, 0, 0); Connect(splitter, activ0_0, splitTensorInfo1, 0, 0); Connect(splitter, activ0_1, splitTensorInfo1, 0, 0); @@ -737,97 +828,155 @@ void CreateSplitterMultipleInputsOneOutputWorkloadTest(armnn::IWorkloadFactory& wlActiv1_1 = std::move(workloadActiv1_1); } -template +template std::unique_ptr CreateResizeBilinearWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. TensorShape outputShape({ 2, 3, 2, 2 }); ResizeBilinearDescriptor resizeDesc; resizeDesc.m_TargetWidth = outputShape[3]; resizeDesc.m_TargetHeight = outputShape[2]; Layer* const layer = graph.AddLayer(resizeDesc, "layer"); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer(0, "input"); Layer* const output = graph.AddLayer(0, "output"); - // connect up - armnn::TensorInfo inputTensorInfo({ 2, 3, 4, 4 }, ResizeBilinearWorkload::ms_DataType); - armnn::TensorInfo outputTensorInfo(outputShape, ResizeBilinearWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo inputTensorInfo({ 2, 3, 4, 4 }, DataType); + armnn::TensorInfo outputTensorInfo(outputShape, DataType); Connect(input, layer, inputTensorInfo); Connect(layer, output, outputTensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload(*layer, graph, factory); ResizeBilinearQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template +template std::unique_ptr CreateL2NormalizationWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. Layer* const layer = graph.AddLayer("l2norm"); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer(0, "input"); Layer* const output = graph.AddLayer(0, "output"); - // connect up - armnn::TensorInfo inputTensorInfo({ 5, 20, 50, 67 }, L2NormalizationWorkload::ms_DataType); - armnn::TensorInfo outputTensorInfo({ 5, 20, 50, 67 }, L2NormalizationWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo inputTensorInfo({ 5, 20, 50, 67 }, DataType); + armnn::TensorInfo outputTensorInfo({ 5, 20, 50, 67 }, DataType); Connect(input, layer, inputTensorInfo); Connect(layer, output, outputTensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload(*layer, graph, factory); L2NormalizationQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template +template std::unique_ptr CreateReshapeWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. TensorShape outputShape({ 1, 4 }); ReshapeDescriptor reshapeDesc; reshapeDesc.m_TargetShape = outputShape; Layer* const layer = graph.AddLayer(reshapeDesc, "layer"); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer(0, "input"); Layer* const output = graph.AddLayer(0, "output"); - // connect up - armnn::TensorInfo inputTensorInfo({ 4, 1 }, ReshapeWorkload::ms_DataType); - armnn::TensorInfo outputTensorInfo(outputShape, ReshapeWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo inputTensorInfo({ 4, 1 }, DataType); + armnn::TensorInfo outputTensorInfo(outputShape, DataType); Connect(input, layer, inputTensorInfo); Connect(layer, output, outputTensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload(*layer, graph, factory); ReshapeQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. + return workload; +} + +template +std::unique_ptr CreateConvertFp16ToFp32WorkloadTest( + armnn::IWorkloadFactory& factory, armnn::Graph& graph) +{ + // Creates the layer we're testing. + ConvertFp16ToFp32Layer* const layer = graph.AddLayer("Fp16ToFp32Converter"); + + // Creates extra layers. + Layer* const input = graph.AddLayer(0, "input"); + Layer* const output = graph.AddLayer(0, "output"); + + // Connects up. + armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16); + armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32); + Connect(input, layer, inputTensorInfo); + Connect(layer, output, outputTensorInfo); + CreateTensorHandles(graph, factory); + + // Makes the workload and checks it. + auto workload = MakeAndCheckWorkload(*layer, graph, factory); + + ConvertFp16ToFp32QueueDescriptor queueDescriptor = workload->GetData(); + BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); + BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); + + // Returns so we can do extra, backend-specific tests. + return workload; +} + +template +std::unique_ptr CreateConvertFp32ToFp16WorkloadTest( + armnn::IWorkloadFactory& factory, armnn::Graph& graph) +{ + // Creates the layer we're testing. + ConvertFp32ToFp16Layer* const layer = graph.AddLayer("Fp32ToFp16Converter"); + + // Creates extra layers. + Layer* const input = graph.AddLayer(0, "input"); + Layer* const output = graph.AddLayer(0, "output"); + + // Connects up. + armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32); + armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16); + Connect(input, layer, inputTensorInfo); + Connect(layer, output, outputTensorInfo); + CreateTensorHandles(graph, factory); + + // Makes the workload and checks it. + auto workload = MakeAndCheckWorkload(*layer, graph, factory); + + ConvertFp32ToFp16QueueDescriptor queueDescriptor = workload->GetData(); + BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); + BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); + + // Returns so we can do extra, backend-specific tests. return workload; } diff --git a/src/armnn/test/CreateWorkloadClNeon.hpp b/src/armnn/test/CreateWorkloadClNeon.hpp index a41a70755f..d92111ac41 100644 --- a/src/armnn/test/CreateWorkloadClNeon.hpp +++ b/src/armnn/test/CreateWorkloadClNeon.hpp @@ -56,22 +56,21 @@ boost::test_tools::predicate_result CompareTensorHandleShape(IComputeTensorHandl return true; } -template